lattifai 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +10 -0
- lattifai/alignment/lattice1_aligner.py +64 -15
- lattifai/alignment/lattice1_worker.py +135 -50
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/tokenizer.py +14 -13
- lattifai/audio2.py +269 -70
- lattifai/caption/caption.py +213 -19
- lattifai/cli/__init__.py +2 -0
- lattifai/cli/alignment.py +2 -1
- lattifai/cli/app_installer.py +35 -33
- lattifai/cli/caption.py +9 -19
- lattifai/cli/diarization.py +108 -0
- lattifai/cli/server.py +3 -1
- lattifai/cli/transcribe.py +55 -38
- lattifai/cli/youtube.py +1 -0
- lattifai/client.py +42 -121
- lattifai/config/alignment.py +37 -2
- lattifai/config/caption.py +1 -1
- lattifai/config/media.py +23 -3
- lattifai/config/transcription.py +4 -0
- lattifai/diarization/lattifai.py +18 -7
- lattifai/errors.py +7 -3
- lattifai/mixin.py +45 -16
- lattifai/server/app.py +2 -1
- lattifai/transcription/__init__.py +1 -1
- lattifai/transcription/base.py +21 -2
- lattifai/transcription/gemini.py +127 -1
- lattifai/transcription/lattifai.py +30 -2
- lattifai/utils.py +96 -28
- lattifai/workflow/file_manager.py +15 -13
- lattifai/workflow/youtube.py +16 -1
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/METADATA +86 -22
- lattifai-1.1.0.dist-info/RECORD +57 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt +2 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE +1 -1
- lattifai-1.0.4.dist-info/RECORD +0 -56
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/WHEEL +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/top_level.txt +0 -0
lattifai/audio2.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from collections import namedtuple
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import BinaryIO,
|
|
5
|
+
from typing import BinaryIO, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import soundfile as sf
|
|
@@ -16,8 +16,14 @@ from lattifai.errors import AudioLoadError
|
|
|
16
16
|
ChannelSelectorType = Union[int, str]
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "
|
|
20
|
-
"""Audio data container with sampling rate
|
|
19
|
+
class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "streaming_chunk_secs", "overlap_secs"])):
|
|
20
|
+
"""Audio data container with sampling rate and numpy array.
|
|
21
|
+
|
|
22
|
+
Supports iteration to stream audio chunks for processing long audio files.
|
|
23
|
+
The streaming_chunk_secs field indicates whether streaming mode should be used downstream.
|
|
24
|
+
The overlap_secs field specifies the overlap duration between consecutive chunks.
|
|
25
|
+
Note: tensor field removed to reduce memory usage. Convert ndarray to tensor on-demand.
|
|
26
|
+
"""
|
|
21
27
|
|
|
22
28
|
def __str__(self) -> str:
|
|
23
29
|
return self.path
|
|
@@ -27,6 +33,66 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "
|
|
|
27
33
|
"""Duration of the audio in seconds."""
|
|
28
34
|
return self.ndarray.shape[-1] / self.sampling_rate
|
|
29
35
|
|
|
36
|
+
@property
|
|
37
|
+
def streaming_mode(self) -> bool:
|
|
38
|
+
"""Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
|
|
39
|
+
if self.streaming_chunk_secs is not None:
|
|
40
|
+
return self.duration > self.streaming_chunk_secs * 1.1
|
|
41
|
+
return False
|
|
42
|
+
|
|
43
|
+
def __iter__(self):
|
|
44
|
+
"""Initialize iterator for chunk-based audio streaming.
|
|
45
|
+
|
|
46
|
+
Returns an iterator that yields audio chunks as AudioData instances.
|
|
47
|
+
Uses streaming_chunk_secs and overlap_secs from the instance.
|
|
48
|
+
"""
|
|
49
|
+
return self.iter_chunks()
|
|
50
|
+
|
|
51
|
+
def iter_chunks(
|
|
52
|
+
self,
|
|
53
|
+
chunk_secs: Optional[float] = None,
|
|
54
|
+
overlap_secs: Optional[float] = None,
|
|
55
|
+
):
|
|
56
|
+
"""Iterate over audio chunks with configurable duration and overlap.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
chunk_secs: Duration of each chunk in seconds (default: uses streaming_chunk_secs or 600.0).
|
|
60
|
+
overlap_secs: Overlap between consecutive chunks in seconds (default: uses overlap_secs or 0.0).
|
|
61
|
+
|
|
62
|
+
Yields:
|
|
63
|
+
AudioData: Chunks of audio data.
|
|
64
|
+
|
|
65
|
+
Example:
|
|
66
|
+
>>> audio = loader("long_audio.wav")
|
|
67
|
+
>>> for chunk in audio.iter_chunks(chunk_secs=60.0, overlap_secs=2.0):
|
|
68
|
+
... process(chunk)
|
|
69
|
+
"""
|
|
70
|
+
chunk_duration = chunk_secs or self.streaming_chunk_secs or 600.0
|
|
71
|
+
overlap_duration = overlap_secs or self.overlap_secs or 0.0
|
|
72
|
+
|
|
73
|
+
chunk_size = int(chunk_duration * self.sampling_rate)
|
|
74
|
+
overlap_size = int(overlap_duration * self.sampling_rate)
|
|
75
|
+
step_size = chunk_size - overlap_size
|
|
76
|
+
total_samples = self.ndarray.shape[-1]
|
|
77
|
+
|
|
78
|
+
current_offset = 0
|
|
79
|
+
while current_offset < total_samples:
|
|
80
|
+
start = current_offset
|
|
81
|
+
end = min(start + chunk_size, total_samples)
|
|
82
|
+
|
|
83
|
+
# Extract chunk from ndarray only
|
|
84
|
+
chunk_ndarray = self.ndarray[..., start:end]
|
|
85
|
+
|
|
86
|
+
yield AudioData(
|
|
87
|
+
sampling_rate=self.sampling_rate,
|
|
88
|
+
ndarray=chunk_ndarray,
|
|
89
|
+
path=f"{self.path}[{start/self.sampling_rate:.2f}s-{end/self.sampling_rate:.2f}s]",
|
|
90
|
+
streaming_chunk_secs=None,
|
|
91
|
+
overlap_secs=None,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
current_offset += step_size
|
|
95
|
+
|
|
30
96
|
|
|
31
97
|
class AudioLoader:
|
|
32
98
|
"""Load and preprocess audio files into AudioData format."""
|
|
@@ -45,62 +111,48 @@ class AudioLoader:
|
|
|
45
111
|
|
|
46
112
|
def _resample_audio(
    self,
    audio_sr: Tuple[np.ndarray, int],
    sampling_rate: int,
    device: Optional[str],
    channel_selector: Optional[ChannelSelectorType],
) -> np.ndarray:
    """Select channels and resample audio to the target sampling rate.

    Args:
        audio_sr: Tuple of (audio, original_sample_rate). ``audio`` is
            indexed as (samples, channels) by this method.
        sampling_rate: Target sampling rate.
        device: Device to perform resampling on.
        channel_selector: ``None`` keeps every channel, an ``int`` picks that
            channel index, and ``"average"`` averages across channels.

    Returns:
        Audio array of shape (1, T) or (C, T) at ``sampling_rate``.

    Raises:
        ValueError: If ``channel_selector`` is an out-of-range channel index
            or an unsupported value.
    """
    audio, sr = audio_sr

    if channel_selector is None:
        # Keep the original multi-channel signal, transposed to (C, T).
        tensor = audio.T
    elif isinstance(channel_selector, int):
        num_channels = audio.shape[1]
        # An out-of-range or negative index would slice to an empty (0, T)
        # array without any error, so reject it explicitly.
        if not 0 <= channel_selector < num_channels:
            raise ValueError(f"Invalid channel: {channel_selector}")
        # Copy so the result does not keep the full multi-channel buffer alive.
        tensor = audio[:, channel_selector : channel_selector + 1].T.copy()
    elif channel_selector == "average":
        tensor = np.mean(audio, axis=1, keepdims=True).T
    else:
        raise ValueError(f"Unsupported channel_selector: {channel_selector}")
    del audio  # drop the local reference to the input buffer

    # tensor: np.ndarray (channels, samples)
    if sr != sampling_rate:
        cache_key = (sr, sampling_rate, device)
        if cache_key not in self._resampler_cache:
            self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
        resampler = self._resampler_cache[cache_key]

        # Resample via torch on the requested device, then return to NumPy.
        tensor = resampler(torch.from_numpy(tensor).to(device=device))
        tensor = tensor.cpu().numpy()

    return tensor
|
|
106
158
|
|
|
@@ -109,7 +161,7 @@ class AudioLoader:
|
|
|
109
161
|
audio: Union[Pathlike, BinaryIO],
|
|
110
162
|
sampling_rate: int,
|
|
111
163
|
channel_selector: Optional[ChannelSelectorType],
|
|
112
|
-
) ->
|
|
164
|
+
) -> np.ndarray:
|
|
113
165
|
"""Load audio from file or binary stream and resample to target rate.
|
|
114
166
|
|
|
115
167
|
Args:
|
|
@@ -118,7 +170,7 @@ class AudioLoader:
|
|
|
118
170
|
channel_selector: How to select channels.
|
|
119
171
|
|
|
120
172
|
Returns:
|
|
121
|
-
Resampled audio
|
|
173
|
+
Resampled audio as a NumPy array of shape (channels, samples).
|
|
122
174
|
|
|
123
175
|
Raises:
|
|
124
176
|
ImportError: If PyAV is needed but not installed.
|
|
@@ -128,11 +180,69 @@ class AudioLoader:
|
|
|
128
180
|
if isinstance(audio, Pathlike):
|
|
129
181
|
audio = str(Path(str(audio)).expanduser())
|
|
130
182
|
|
|
131
|
-
# load audio
|
|
183
|
+
# load audio in chunks to reduce memory footprint for long files
|
|
132
184
|
try:
|
|
133
|
-
|
|
134
|
-
|
|
185
|
+
# First check file duration to decide loading strategy
|
|
186
|
+
info = sf.info(audio)
|
|
187
|
+
duration = info.duration
|
|
188
|
+
|
|
189
|
+
# For very long audio (>60 minutes), use chunk-based loading
|
|
190
|
+
if duration > 3600: # 60 minutes
|
|
191
|
+
with sf.SoundFile(audio, "r") as f:
|
|
192
|
+
sample_rate = f.samplerate
|
|
193
|
+
total_frames = f.frames
|
|
194
|
+
|
|
195
|
+
# Pre-calculate output size to avoid list accumulation
|
|
196
|
+
num_channels = 1 if channel_selector else f.channels
|
|
197
|
+
expected_output_samples = int(total_frames * sampling_rate / sample_rate)
|
|
198
|
+
|
|
199
|
+
# Pre-allocate output array
|
|
200
|
+
waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
|
|
201
|
+
|
|
202
|
+
# Use source sample rate for reading, not target
|
|
203
|
+
chunk_frames = int(sample_rate * 1800) # 30-minute chunks at source rate
|
|
204
|
+
output_offset = 0
|
|
205
|
+
|
|
206
|
+
while True:
|
|
207
|
+
chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
|
|
208
|
+
if chunk.size == 0:
|
|
209
|
+
break
|
|
210
|
+
|
|
211
|
+
# Resample chunk -> (channels, samples)
|
|
212
|
+
resampled_chunk = self._resample_audio(
|
|
213
|
+
(chunk, sample_rate),
|
|
214
|
+
sampling_rate,
|
|
215
|
+
device=self.device,
|
|
216
|
+
channel_selector=channel_selector,
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
# Write directly to pre-allocated array
|
|
220
|
+
chunk_length = resampled_chunk.shape[-1]
|
|
221
|
+
waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
|
|
222
|
+
output_offset += chunk_length
|
|
223
|
+
|
|
224
|
+
# Clean up immediately
|
|
225
|
+
del chunk, resampled_chunk
|
|
226
|
+
|
|
227
|
+
# Trim to actual size if needed (due to rounding in resampling)
|
|
228
|
+
if output_offset < expected_output_samples:
|
|
229
|
+
waveform = waveform[..., :output_offset]
|
|
230
|
+
|
|
231
|
+
return waveform
|
|
232
|
+
else:
|
|
233
|
+
# For shorter audio, use standard loading
|
|
234
|
+
waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
|
|
235
|
+
# Resample and return directly to avoid double processing
|
|
236
|
+
result = self._resample_audio(
|
|
237
|
+
(waveform, sample_rate),
|
|
238
|
+
sampling_rate,
|
|
239
|
+
device=self.device,
|
|
240
|
+
channel_selector=channel_selector,
|
|
241
|
+
)
|
|
242
|
+
del waveform
|
|
243
|
+
return result
|
|
135
244
|
except Exception as primary_error:
|
|
245
|
+
print(f"Primary error with soundfile: {primary_error}")
|
|
136
246
|
# Fallback to PyAV for formats not supported by soundfile
|
|
137
247
|
try:
|
|
138
248
|
import av
|
|
@@ -150,62 +260,151 @@ class AudioLoader:
|
|
|
150
260
|
if audio_stream is None:
|
|
151
261
|
raise ValueError(f"No audio stream found in file: {audio}")
|
|
152
262
|
|
|
153
|
-
# Resample to target sample rate during decoding
|
|
154
263
|
audio_stream.codec_context.format = av.AudioFormat("flt") # 32-bit float
|
|
155
|
-
|
|
156
|
-
frames = []
|
|
157
|
-
for frame in container.decode(audio_stream):
|
|
158
|
-
# Convert frame to numpy array
|
|
159
|
-
array = frame.to_ndarray()
|
|
160
|
-
# Ensure shape is (channels, samples)
|
|
161
|
-
if array.ndim == 1:
|
|
162
|
-
array = array.reshape(1, -1)
|
|
163
|
-
elif array.ndim == 2 and array.shape[0] > array.shape[1]:
|
|
164
|
-
array = array.T
|
|
165
|
-
frames.append(array)
|
|
166
|
-
|
|
167
|
-
container.close()
|
|
168
|
-
|
|
169
|
-
if not frames:
|
|
170
|
-
raise ValueError(f"No audio data found in file: {audio}")
|
|
171
|
-
|
|
172
|
-
# Concatenate all frames
|
|
173
|
-
waveform = np.concatenate(frames, axis=1).astype(np.float32) # (channels, samples)
|
|
174
264
|
sample_rate = audio_stream.codec_context.sample_rate
|
|
265
|
+
|
|
266
|
+
# Estimate duration to decide processing strategy
|
|
267
|
+
duration_estimate = None
|
|
268
|
+
if audio_stream.duration and audio_stream.time_base:
|
|
269
|
+
duration_estimate = float(audio_stream.duration * audio_stream.time_base)
|
|
270
|
+
else:
|
|
271
|
+
print(f"WARNING: Failed to estimate duration for audio: {audio}")
|
|
272
|
+
|
|
273
|
+
# For very long audio (>30 minutes), process and resample in chunks
|
|
274
|
+
if duration_estimate and duration_estimate > 1800:
|
|
275
|
+
# Estimate output size and pre-allocate with buffer
|
|
276
|
+
num_channels = 1 if channel_selector else audio_stream.codec_context.channels
|
|
277
|
+
estimated_samples = int(duration_estimate * sampling_rate * 1.1) # 10% buffer
|
|
278
|
+
waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
|
|
279
|
+
|
|
280
|
+
frames = []
|
|
281
|
+
accumulated_samples = 0
|
|
282
|
+
output_offset = 0
|
|
283
|
+
chunk_sample_target = int(sample_rate * 600) # 10 minutes at original rate
|
|
284
|
+
|
|
285
|
+
for frame in container.decode(audio_stream):
|
|
286
|
+
array = frame.to_ndarray()
|
|
287
|
+
|
|
288
|
+
# Ensure shape is (samples, channels)
|
|
289
|
+
if array.ndim == 1:
|
|
290
|
+
array = array.reshape(-1, 1)
|
|
291
|
+
elif array.ndim == 2 and array.shape[0] < array.shape[1]:
|
|
292
|
+
array = array.T
|
|
293
|
+
|
|
294
|
+
frames.append(array)
|
|
295
|
+
accumulated_samples += array.shape[0]
|
|
296
|
+
|
|
297
|
+
# Process chunk when accumulated enough samples
|
|
298
|
+
if accumulated_samples >= chunk_sample_target:
|
|
299
|
+
chunk = np.concatenate(frames, axis=0).astype(np.float32)
|
|
300
|
+
del frames # Free frames list before resampling
|
|
301
|
+
# Resample chunk -> (channels, samples)
|
|
302
|
+
resampled_chunk = self._resample_audio(
|
|
303
|
+
(chunk, sample_rate),
|
|
304
|
+
sampling_rate,
|
|
305
|
+
device=self.device,
|
|
306
|
+
channel_selector=channel_selector,
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
chunk_length = resampled_chunk.shape[-1]
|
|
310
|
+
if output_offset + chunk_length > waveform.shape[-1]:
|
|
311
|
+
print(
|
|
312
|
+
f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}" # noqa: E501
|
|
313
|
+
)
|
|
314
|
+
resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
|
|
315
|
+
|
|
316
|
+
# Write directly to array
|
|
317
|
+
waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
|
|
318
|
+
output_offset += chunk_length
|
|
319
|
+
|
|
320
|
+
# Clean up immediately
|
|
321
|
+
del chunk, resampled_chunk
|
|
322
|
+
frames = [] # Create new list
|
|
323
|
+
accumulated_samples = 0
|
|
324
|
+
|
|
325
|
+
# Process remaining frames
|
|
326
|
+
if frames:
|
|
327
|
+
chunk = np.concatenate(frames, axis=0).astype(np.float32)
|
|
328
|
+
del frames
|
|
329
|
+
resampled_chunk = self._resample_audio(
|
|
330
|
+
(chunk, sample_rate),
|
|
331
|
+
sampling_rate,
|
|
332
|
+
device=self.device,
|
|
333
|
+
channel_selector=channel_selector,
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
chunk_length = resampled_chunk.shape[-1]
|
|
337
|
+
if output_offset + chunk_length > waveform.shape[-1]:
|
|
338
|
+
print(
|
|
339
|
+
f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}" # noqa: E501
|
|
340
|
+
)
|
|
341
|
+
resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
|
|
342
|
+
|
|
343
|
+
waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
|
|
344
|
+
output_offset += chunk_length
|
|
345
|
+
del chunk, resampled_chunk
|
|
346
|
+
|
|
347
|
+
container.close()
|
|
348
|
+
|
|
349
|
+
if output_offset == 0:
|
|
350
|
+
raise ValueError(f"No audio data found in file: {audio}")
|
|
351
|
+
|
|
352
|
+
# Trim to actual size
|
|
353
|
+
waveform = waveform[..., :output_offset]
|
|
354
|
+
return waveform
|
|
355
|
+
else:
|
|
356
|
+
# For shorter audio, decode all frames first and resample once
|
|
357
|
+
frames = []
|
|
358
|
+
for frame in container.decode(audio_stream):
|
|
359
|
+
array = frame.to_ndarray()
|
|
360
|
+
# Ensure shape is (samples, channels)
|
|
361
|
+
if array.ndim == 1:
|
|
362
|
+
array = array.reshape(-1, 1)
|
|
363
|
+
elif array.ndim == 2 and array.shape[0] < array.shape[1]:
|
|
364
|
+
array = array.T
|
|
365
|
+
frames.append(array)
|
|
366
|
+
container.close()
|
|
367
|
+
|
|
368
|
+
if not frames:
|
|
369
|
+
raise ValueError(f"No audio data found in file: {audio}")
|
|
370
|
+
|
|
371
|
+
# Concatenate remaining frames
|
|
372
|
+
waveform = np.concatenate(frames, axis=0).astype(np.float32)
|
|
373
|
+
del frames
|
|
374
|
+
# Resample and return directly
|
|
375
|
+
result = self._resample_audio(
|
|
376
|
+
(waveform, sample_rate),
|
|
377
|
+
sampling_rate,
|
|
378
|
+
device=self.device,
|
|
379
|
+
channel_selector=channel_selector,
|
|
380
|
+
)
|
|
381
|
+
del waveform
|
|
382
|
+
return result
|
|
175
383
|
except Exception as e:
|
|
176
384
|
raise RuntimeError(f"Failed to load audio file {audio}: {e}")
|
|
177
385
|
|
|
178
|
-
return self._resample_audio(
|
|
179
|
-
(torch.from_numpy(waveform), sample_rate),
|
|
180
|
-
sampling_rate,
|
|
181
|
-
device=self.device,
|
|
182
|
-
channel_selector=channel_selector,
|
|
183
|
-
)
|
|
184
|
-
|
|
185
386
|
def __call__(
    self,
    audio: Union[Pathlike, BinaryIO],
    sampling_rate: int = 16000,
    channel_selector: Optional[ChannelSelectorType] = "average",
    streaming_chunk_secs: Optional[float] = None,
) -> AudioData:
    """Load an audio source and wrap the samples in an AudioData record.

    Args:
        audio: Path to audio file or binary stream.
        sampling_rate: Target sampling rate (default: 16000).
        channel_selector: How to select channels (default: "average").
        streaming_chunk_secs: Duration in seconds for streaming chunks
            (default: None, disabled).

    Returns:
        AudioData namedtuple holding the loaded ndarray, the target
        sampling_rate, the source path (or "<BinaryIO>" for streams), the
        given streaming_chunk_secs, and overlap_secs fixed at 0.0.
    """
    samples = self._load_audio(audio, sampling_rate, channel_selector)
    # Streams have no path, so they get a fixed placeholder label.
    source_label = str(audio) if isinstance(audio, Pathlike) else "<BinaryIO>"
    return AudioData(
        sampling_rate=sampling_rate,
        ndarray=samples,
        path=source_label,
        streaming_chunk_secs=streaming_chunk_secs,
        overlap_secs=0.0,
    )
|