docling 2.57.0__py3-none-any.whl → 2.58.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release: this version of docling might be problematic.

@@ -10,13 +10,394 @@ from docling.datamodel.pipeline_options_asr_model import (
     # AsrResponseFormat,
     # ApiAsrOptions,
     InferenceAsrFramework,
+    InlineAsrMlxWhisperOptions,
    InlineAsrNativeWhisperOptions,
     TransformersModelType,
 )
 
 _log = logging.getLogger(__name__)
 
-WHISPER_TINY = InlineAsrNativeWhisperOptions(
+
+def _get_whisper_tiny_model():
+    """
+    Get the best Whisper Tiny model for the current hardware.
+
+    Automatically selects MLX Whisper Tiny for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Tiny.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-tiny-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="tiny",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_TINY = _get_whisper_tiny_model()
+
+
+def _get_whisper_small_model():
+    """
+    Get the best Whisper Small model for the current hardware.
+
+    Automatically selects MLX Whisper Small for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Small.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-small-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="small",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_SMALL = _get_whisper_small_model()
+
+
+def _get_whisper_medium_model():
+    """
+    Get the best Whisper Medium model for the current hardware.
+
+    Automatically selects MLX Whisper Medium for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Medium.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-medium-mlx-8bit",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="medium",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_MEDIUM = _get_whisper_medium_model()
+
+
+def _get_whisper_base_model():
+    """
+    Get the best Whisper Base model for the current hardware.
+
+    Automatically selects MLX Whisper Base for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Base.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-base-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="base",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_BASE = _get_whisper_base_model()
+
+
+def _get_whisper_large_model():
+    """
+    Get the best Whisper Large model for the current hardware.
+
+    Automatically selects MLX Whisper Large for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Large.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-large-mlx-8bit",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="large",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_LARGE = _get_whisper_large_model()
+
+
+def _get_whisper_turbo_model():
+    """
+    Get the best Whisper Turbo model for the current hardware.
+
+    Automatically selects MLX Whisper Turbo for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Turbo.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-turbo",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="turbo",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_TURBO = _get_whisper_turbo_model()
+
+# Explicit MLX Whisper model options for users who want to force MLX usage
+WHISPER_TINY_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-tiny-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_SMALL_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-small-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_MEDIUM_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-medium-mlx-8bit",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_BASE_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-base-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_LARGE_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-large-mlx-8bit",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_TURBO_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-turbo",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+# Explicit Native Whisper model options for users who want to force native usage
+WHISPER_TINY_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="tiny",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -27,7 +408,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+WHISPER_SMALL_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="small",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -38,7 +419,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+WHISPER_MEDIUM_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="medium",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -49,7 +430,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_BASE = InlineAsrNativeWhisperOptions(
+WHISPER_BASE_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="base",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -60,7 +441,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+WHISPER_LARGE_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="large",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -71,7 +452,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+WHISPER_TURBO_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="turbo",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -82,11 +463,32 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
+# Note: The main WHISPER_* models (WHISPER_TURBO, WHISPER_BASE, etc.) automatically
+# select the best implementation (MLX on Apple Silicon, Native elsewhere).
+# Use the explicit _MLX or _NATIVE variants if you need to force a specific implementation.
+
 
 class AsrModelType(str, Enum):
+    # Auto-selecting models (choose best implementation for hardware)
     WHISPER_TINY = "whisper_tiny"
     WHISPER_SMALL = "whisper_small"
     WHISPER_MEDIUM = "whisper_medium"
     WHISPER_BASE = "whisper_base"
     WHISPER_LARGE = "whisper_large"
     WHISPER_TURBO = "whisper_turbo"
+
+    # Explicit MLX models (force MLX implementation)
+    WHISPER_TINY_MLX = "whisper_tiny_mlx"
+    WHISPER_SMALL_MLX = "whisper_small_mlx"
+    WHISPER_MEDIUM_MLX = "whisper_medium_mlx"
+    WHISPER_BASE_MLX = "whisper_base_mlx"
+    WHISPER_LARGE_MLX = "whisper_large_mlx"
+    WHISPER_TURBO_MLX = "whisper_turbo_mlx"
+
+    # Explicit Native models (force native implementation)
+    WHISPER_TINY_NATIVE = "whisper_tiny_native"
+    WHISPER_SMALL_NATIVE = "whisper_small_native"
+    WHISPER_MEDIUM_NATIVE = "whisper_medium_native"
+    WHISPER_BASE_NATIVE = "whisper_base_native"
+    WHISPER_LARGE_NATIVE = "whisper_large_native"
+    WHISPER_TURBO_NATIVE = "whisper_turbo_native"
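
The hunks above all belong to one module (its hunk headers import from docling.datamodel.pipeline_options_asr_model; the diff does not name the file). The practical effect: each WHISPER_* constant now resolves to the best implementation at import time, while the _MLX and _NATIVE variants pin one. A minimal sketch of the difference, assuming the module path docling.datamodel.asr_model_specs:

    # Sketch only: the constants and their fields come from the diff above;
    # the import path is an assumption, since the diff does not name the file.
    from docling.datamodel.asr_model_specs import (
        WHISPER_TURBO,         # auto: MLX on Apple Silicon + mlx-whisper, else native
        WHISPER_TURBO_MLX,     # pinned to MLX
        WHISPER_TURBO_NATIVE,  # pinned to native whisper
    )

    # Prints "InlineAsrMlxWhisperOptions" on an M-series Mac with mlx-whisper
    # installed, "InlineAsrNativeWhisperOptions" everywhere else.
    print(type(WHISPER_TURBO).__name__)
    print(WHISPER_TURBO_MLX.repo_id)     # "mlx-community/whisper-turbo"
    print(WHISPER_TURBO_NATIVE.repo_id)  # "turbo"

Note that the selection runs once at import time, so installing mlx-whisper mid-session does not change an already-imported constant.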
@@ -0,0 +1,82 @@
+from pathlib import PurePath
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel, Field, SecretStr
+
+
+class BaseBackendOptions(BaseModel):
+    """Common options for all declarative document backends."""
+
+    enable_remote_fetch: bool = Field(
+        False, description="Enable remote resource fetching."
+    )
+    enable_local_fetch: bool = Field(
+        False, description="Enable local resource fetching."
+    )
+
+
+class DeclarativeBackendOptions(BaseBackendOptions):
+    """Default backend options for a declarative document backend."""
+
+    kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
+
+
+class HTMLBackendOptions(BaseBackendOptions):
+    """Options specific to the HTML backend.
+
+    This class can be extended to include options specific to HTML processing.
+    """
+
+    kind: Literal["html"] = Field("html", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in an HTML document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the HTML document. If provided, the backend "
+            "will use it to resolve relative paths in the HTML document."
+        ),
+    )
+
+
+class MarkdownBackendOptions(BaseBackendOptions):
+    """Options specific to the Markdown backend."""
+
+    kind: Literal["md"] = Field("md", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in the markdown document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the markdown document. If provided, the backend "
+            "will use it to resolve relative paths in the markdown document."
+        ),
+    )
+
+
+class PdfBackendOptions(BaseBackendOptions):
+    """Backend options for pdf document backends."""
+
+    kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
+    password: Optional[SecretStr] = None
+
+
+BackendOptions = Annotated[
+    Union[
+        DeclarativeBackendOptions,
+        HTMLBackendOptions,
+        MarkdownBackendOptions,
+        PdfBackendOptions,
+    ],
+    Field(discriminator="kind"),
+]
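
The new file above is a self-contained pydantic v2 discriminated union: each subclass carries a kind literal that is used only for discrimination and excluded from serialization. A minimal sketch of how such a union validates, assuming the module lands at docling.datamodel.backend_options (the diff does not show the file path):

    # Sketch only: class and field names are from the diff above; the module
    # path is an assumption, since the diff does not name the new file.
    from pydantic import TypeAdapter

    from docling.datamodel.backend_options import (
        BackendOptions,
        HTMLBackendOptions,
    )

    # The "kind" key routes the payload to the matching subclass.
    adapter = TypeAdapter(BackendOptions)
    opts = adapter.validate_python(
        {"kind": "html", "fetch_images": True, "enable_remote_fetch": True}
    )
    assert isinstance(opts, HTMLBackendOptions)

    # "kind" is declared with exclude=True, so it disappears on dump.
    print(opts.model_dump())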
@@ -94,7 +94,7 @@ FormatToExtensions: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
     InputFormat.VTT: ["vtt"],
 }
 
@@ -128,7 +128,22 @@ FormatToMimeType: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
+    InputFormat.AUDIO: [
+        "audio/x-wav",
+        "audio/mpeg",
+        "audio/wav",
+        "audio/mp3",
+        "audio/mp4",
+        "audio/m4a",
+        "audio/aac",
+        "audio/ogg",
+        "audio/flac",
+        "audio/x-flac",
+        "video/mp4",
+        "video/avi",
+        "video/x-msvideo",
+        "video/quicktime",
+    ],
     InputFormat.VTT: ["text/vtt"],
 }
 
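
Together, these two hunks widen audio detection from wav/mp3 to common audio codecs and even video containers (mp4, avi, mov), both by extension and by MIME type. A quick check, assuming the tables still live in docling.datamodel.base_models as in earlier docling releases:

    # Sketch only: the table contents are from the diff above; the module
    # path (docling.datamodel.base_models) is assumed from prior releases.
    from docling.datamodel.base_models import (
        FormatToExtensions,
        FormatToMimeType,
        InputFormat,
    )

    assert "m4a" in FormatToExtensions[InputFormat.AUDIO]
    assert "mov" in FormatToExtensions[InputFormat.AUDIO]
    assert "video/quicktime" in FormatToMimeType[InputFormat.AUDIO]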