docling 2.69.0__py3-none-any.whl
This diff shows the contents of publicly available package versions as published to their public registries. It is provided for informational purposes only.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
--- /dev/null
+++ b/docling/pipeline/asr_pipeline.py
@@ -0,0 +1,431 @@
+import logging
+import os
+import re
+import sys
+import tempfile
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional, Union, cast
+
+from docling_core.types.doc import DoclingDocument, DocumentOrigin
+
+# import whisper  # type: ignore
+# import librosa
+# import numpy as np
+# import soundfile as sf  # type: ignore
+from docling_core.types.doc.labels import DocItemLabel
+from pydantic import BaseModel, Field, validator
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
+
+# from pydub import AudioSegment  # type: ignore
+# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    FormatToMimeType,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrMlxWhisperOptions,
+    InlineAsrNativeWhisperOptions,
+    # AsrResponseFormat,
+    InlineAsrOptions,
+)
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+)
+from docling.datamodel.settings import settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class _ConversationWord(BaseModel):
+    text: str
+    start_time: Optional[float] = Field(
+        None, description="Start time in seconds from video start"
+    )
+    end_time: Optional[float] = Field(
+        None, ge=0, description="End time in seconds from video start"
+    )
+
+
+class _ConversationItem(BaseModel):
+    text: str
+    start_time: Optional[float] = Field(
+        None, description="Start time in seconds from video start"
+    )
+    end_time: Optional[float] = Field(
+        None, ge=0, description="End time in seconds from video start"
+    )
+    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
+    speaker: Optional[str] = Field(
+        None, description="Speaker name, defaults to speaker-{speaker_id}"
+    )
+    words: Optional[list[_ConversationWord]] = Field(
+        None, description="Individual words with timestamps"
+    )
+
+    def __lt__(self, other):
+        if not isinstance(other, _ConversationItem):
+            return NotImplemented
+        return self.start_time < other.start_time
+
+    def __eq__(self, other):
+        if not isinstance(other, _ConversationItem):
+            return NotImplemented
+        return self.start_time == other.start_time
+
+    def to_string(self) -> str:
+        """Format the conversation entry as a string"""
+        result = ""
+        if (self.start_time is not None) and (self.end_time is not None):
+            result += f"[time: {self.start_time}-{self.end_time}] "
+
+        if self.speaker is not None:
+            result += f"[speaker:{self.speaker}] "
+
+        result += self.text
+        return result
+
+
+class _NativeWhisperModel:
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        asr_options: InlineAsrNativeWhisperOptions,
+    ):
+        """
+        Transcriber using native Whisper.
+        """
+        self.enabled = enabled
+
+        _log.info(f"artifacts-path: {artifacts_path}")
+        _log.info(f"accelerator_options: {accelerator_options}")
+
+        if self.enabled:
+            try:
+                import whisper  # type: ignore
+            except ImportError:
+                if sys.version_info < (3, 14):
+                    raise ImportError(
+                        "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+                    )
+                else:
+                    raise ImportError(
+                        "whisper is not installed. Unfortunately its dependencies are not yet available for Python 3.14."
+                    )
+
+            self.asr_options = asr_options
+            self.max_tokens = asr_options.max_new_tokens
+            self.temperature = asr_options.temperature
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=asr_options.supported_devices,
+            )
+            _log.info(f"Available device for Whisper: {self.device}")
+
+            self.model_name = asr_options.repo_id
+            _log.info(f"loading _NativeWhisperModel({self.model_name})")
+            if artifacts_path is not None:
+                _log.info(f"loading {self.model_name} from {artifacts_path}")
+                self.model = whisper.load_model(
+                    name=self.model_name,
+                    device=self.device,
+                    download_root=str(artifacts_path),
+                )
+            else:
+                self.model = whisper.load_model(
+                    name=self.model_name, device=self.device
+                )
+
+            self.verbose = asr_options.verbose
+            self.timestamps = asr_options.timestamps
+            self.word_timestamps = asr_options.word_timestamps
+
+    def run(self, conv_res: ConversionResult) -> ConversionResult:
+        # Access the file path from the backend, similar to how other pipelines handle it
+        path_or_stream = conv_res.input._backend.path_or_stream
+
+        # Handle both Path and BytesIO inputs
+        temp_file_path: Optional[Path] = None
+
+        if isinstance(path_or_stream, BytesIO):
+            # For BytesIO, write to a temporary file since whisper requires a file path
+            suffix = Path(conv_res.input.file.name).suffix or ".wav"
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
+                tmp_file.write(path_or_stream.getvalue())
+                temp_file_path = Path(tmp_file.name)
+            audio_path = temp_file_path
+        elif isinstance(path_or_stream, Path):
+            audio_path = path_or_stream
+        else:
+            raise RuntimeError(
+                f"ASR pipeline requires a file path or BytesIO stream, but got {type(path_or_stream)}"
+            )
+
+        try:
+            conversation = self.transcribe(audio_path)
+
+            # Ensure we have a proper DoclingDocument
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/x-wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+
+            for citem in conversation:
+                conv_res.document.add_text(
+                    label=DocItemLabel.TEXT, text=citem.to_string()
+                )
+
+            return conv_res
+
+        except Exception as exc:
+            _log.error(f"Audio transcription failed: {exc}")
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        finally:
+            # Clean up temporary file if created
+            if temp_file_path is not None and temp_file_path.exists():
+                try:
+                    temp_file_path.unlink()
+                except Exception as e:
+                    _log.warning(
+                        f"Failed to delete temporary file {temp_file_path}: {e}"
+                    )
+
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        result = self.model.transcribe(
+            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
+        )
+
+        convo: list[_ConversationItem] = []
+        for segment in result["segments"]:
+            item = _ConversationItem(
+                start_time=segment["start"], end_time=segment["end"], text=segment["text"], words=[]
+            )
+            if "words" in segment and self.word_timestamps:
+                item.words = []
+                for word_data in segment["words"]:
+                    item.words.append(
+                        _ConversationWord(
+                            start_time=word_data["start"],
+                            end_time=word_data["end"],
+                            text=word_data["word"],
+                        )
+                    )
+            convo.append(item)
+
+        return convo
+
+
+class _MlxWhisperModel:
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        asr_options: InlineAsrMlxWhisperOptions,
+    ):
+        """
+        Transcriber using MLX Whisper for Apple Silicon optimization.
+        """
+        self.enabled = enabled
+
+        _log.info(f"artifacts-path: {artifacts_path}")
+        _log.info(f"accelerator_options: {accelerator_options}")
+
+        if self.enabled:
+            try:
+                import mlx_whisper  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-whisper is not installed. Please install it via `pip install mlx-whisper` or do `uv sync --extra asr`."
+                )
+            self.asr_options = asr_options
+            self.mlx_whisper = mlx_whisper
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=asr_options.supported_devices,
+            )
+            _log.info(f"Available device for MLX Whisper: {self.device}")
+
+            self.model_name = asr_options.repo_id
+            _log.info(f"loading _MlxWhisperModel({self.model_name})")
+
+            # MLX Whisper models are loaded differently - they use HuggingFace repos
+            self.model_path = self.model_name
+
+            # Store MLX-specific options
+            self.language = asr_options.language
+            self.task = asr_options.task
+            self.word_timestamps = asr_options.word_timestamps
+            self.no_speech_threshold = asr_options.no_speech_threshold
+            self.logprob_threshold = asr_options.logprob_threshold
+            self.compression_ratio_threshold = asr_options.compression_ratio_threshold
+
+    def run(self, conv_res: ConversionResult) -> ConversionResult:
+        audio_path: Path = Path(conv_res.input.file).resolve()
+
+        try:
+            conversation = self.transcribe(audio_path)
+
+            # Ensure we have a proper DoclingDocument
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/x-wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+
+            for citem in conversation:
+                conv_res.document.add_text(
+                    label=DocItemLabel.TEXT, text=citem.to_string()
+                )
+
+            conv_res.status = ConversionStatus.SUCCESS
+            return conv_res
+
+        except Exception as exc:
+            _log.error(f"MLX audio transcription failed: {exc}")
+
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        """
+        Transcribe audio using MLX Whisper.
+
+        Args:
+            fpath: Path to audio file
+
+        Returns:
+            List of conversation items with timestamps
+        """
+        result = self.mlx_whisper.transcribe(
+            str(fpath),
+            path_or_hf_repo=self.model_path,
+            language=self.language,
+            task=self.task,
+            word_timestamps=self.word_timestamps,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
+            compression_ratio_threshold=self.compression_ratio_threshold,
+        )
+
+        convo: list[_ConversationItem] = []
+
+        # MLX Whisper returns segments similar to native Whisper
+        for segment in result.get("segments", []):
+            item = _ConversationItem(
+                start_time=segment.get("start"),
+                end_time=segment.get("end"),
+                text=segment.get("text", "").strip(),
+                words=[],
+            )
+
+            # Add word-level timestamps if available
+            if self.word_timestamps and "words" in segment:
+                item.words = []
+                for word_data in segment["words"]:
+                    item.words.append(
+                        _ConversationWord(
+                            start_time=word_data.get("start"),
+                            end_time=word_data.get("end"),
+                            text=word_data.get("word", ""),
+                        )
+                    )
+            convo.append(item)
+
+        return convo
+
+
+class AsrPipeline(BasePipeline):
+    def __init__(self, pipeline_options: AsrPipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = True
+
+        self.pipeline_options: AsrPipelineOptions = pipeline_options
+        self._model: Union[_NativeWhisperModel, _MlxWhisperModel]
+
+        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
+            native_asr_options: InlineAsrNativeWhisperOptions = (
+                self.pipeline_options.asr_options
+            )
+            self._model = _NativeWhisperModel(
+                enabled=True,  # must always be enabled for this pipeline to make sense.
+                artifacts_path=self.artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                asr_options=native_asr_options,
+            )
+        elif isinstance(self.pipeline_options.asr_options, InlineAsrMlxWhisperOptions):
+            mlx_asr_options: InlineAsrMlxWhisperOptions = (
+                self.pipeline_options.asr_options
+            )
+            self._model = _MlxWhisperModel(
+                enabled=True,  # must always be enabled for this pipeline to make sense.
+                artifacts_path=self.artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                asr_options=mlx_asr_options,
+            )
+        else:
+            _log.error(f"No model support for {self.pipeline_options.asr_options}")
+
+    def _has_text(self, document: "DoclingDocument") -> bool:
+        """
+        Helper method to check if the document contains any transcribed text.
+        A transcription is considered non-empty if the .texts list contains items with actual, non-whitespace content.
+        """
+        if not document or not document.texts:
+            return False
+        for item in document.texts:
+            if item.text and item.text.strip():
+                return True
+        return False
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        """Determines the final status of ASR conversion based on its result."""
+        if conv_res.status == ConversionStatus.FAILURE or conv_res.errors:
+            return ConversionStatus.FAILURE
+        if not self._has_text(conv_res.document):
+            _log.warning(
+                "ASR conversion resulted in an empty document. "
+                f"File: {conv_res.input.file.name}"
+            )
+            return ConversionStatus.PARTIAL_SUCCESS
+        return ConversionStatus.SUCCESS
+
+    @classmethod
+    def get_default_options(cls) -> AsrPipelineOptions:
+        return AsrPipelineOptions()
+
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            self._model.run(conv_res=conv_res)
+
+        return conv_res
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, NoOpBackend)
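For orientation, here is a minimal sketch of how this new `AsrPipeline` is typically wired into a converter, following docling's documented ASR examples. `InputFormat.AUDIO`, `AudioFormatOption`, and the `WHISPER_TURBO` model spec live elsewhere in the package (see `docling/datamodel/asr_model_specs.py` and `docling/document_converter.py` in the file list above) and are assumed here rather than shown in this hunk.

```python
# Sketch only: wiring AsrPipeline into a DocumentConverter, assuming the
# AudioFormatOption / InputFormat.AUDIO / WHISPER_TURBO names from docling's
# documented examples (not part of this diff hunk).
from docling.datamodel import asr_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

pipeline_options = AsrPipelineOptions(asr_options=asr_model_specs.WHISPER_TURBO)

converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

# Each transcript segment becomes one text item, rendered by
# _ConversationItem.to_string() as "[time: 0.0-4.0] ...".
result = converter.convert("speech_sample.wav")
print(result.document.export_to_markdown())
```

Note that on Apple Silicon, passing `InlineAsrMlxWhisperOptions` instead selects the `_MlxWhisperModel` branch shown above.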
--- /dev/null
+++ b/docling/pipeline/base_extraction_pipeline.py
@@ -0,0 +1,72 @@
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional
+
+from docling.datamodel.base_models import ConversionStatus, ErrorItem
+from docling.datamodel.document import InputDocument
+from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
+from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
+from docling.datamodel.settings import settings
+
+_log = logging.getLogger(__name__)
+
+
+class BaseExtractionPipeline(ABC):
+    def __init__(self, pipeline_options: PipelineOptions):
+        self.pipeline_options = pipeline_options
+
+        self.artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            self.artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if self.artifacts_path is not None and not self.artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {self.artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+
+    def execute(
+        self,
+        in_doc: InputDocument,
+        raises_on_error: bool,
+        template: Optional[ExtractionTemplateType] = None,
+    ) -> ExtractionResult:
+        ext_res = ExtractionResult(input=in_doc)
+
+        try:
+            ext_res = self._extract_data(ext_res, template)
+            ext_res.status = self._determine_status(ext_res)
+        except Exception as e:
+            ext_res.status = ConversionStatus.FAILURE
+            error_item = ErrorItem(
+                component_type="extraction_pipeline",
+                module_name=self.__class__.__name__,
+                error_message=str(e),
+            )
+            ext_res.errors.append(error_item)
+            if raises_on_error:
+                raise e
+
+        return ext_res
+
+    @abstractmethod
+    def _extract_data(
+        self,
+        ext_res: ExtractionResult,
+        template: Optional[ExtractionTemplateType] = None,
+    ) -> ExtractionResult:
+        """Subclass must populate ext_res.pages/errors and return the result."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
+        """Subclass must decide SUCCESS/PARTIAL_SUCCESS/FAILURE based on ext_res."""
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> PipelineOptions:
+        pass
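`BaseExtractionPipeline` only supplies the `execute()` scaffolding: error capture, status assignment, and the artifacts-path check. A concrete pipeline (such as `extraction_vlm_pipeline.py` in the file list) must implement `_extract_data`, `_determine_status`, and `get_default_options`. A hypothetical minimal subclass, sketched purely to show that contract and not part of docling:

```python
from typing import Optional

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline


class NoOpExtractionPipeline(BaseExtractionPipeline):
    """Hypothetical example subclass; not part of the docling package."""

    def _extract_data(
        self,
        ext_res: ExtractionResult,
        template: Optional[ExtractionTemplateType] = None,
    ) -> ExtractionResult:
        # A real pipeline would run a model here and populate ext_res.pages
        # (see the _extract_data docstring above); this sketch returns the
        # result unchanged.
        return ext_res

    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
        # Fail only if extraction recorded errors; execute() already appends
        # an ErrorItem and sets FAILURE when an exception escapes.
        return ConversionStatus.FAILURE if ext_res.errors else ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        return PipelineOptions()
```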