lattifai 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +10 -0
- lattifai/alignment/lattice1_aligner.py +33 -13
- lattifai/alignment/lattice1_worker.py +121 -50
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/tokenizer.py +3 -3
- lattifai/audio2.py +269 -70
- lattifai/caption/caption.py +161 -3
- lattifai/cli/alignment.py +2 -1
- lattifai/cli/app_installer.py +35 -33
- lattifai/cli/caption.py +8 -18
- lattifai/cli/server.py +3 -1
- lattifai/cli/transcribe.py +53 -38
- lattifai/cli/youtube.py +1 -0
- lattifai/client.py +16 -11
- lattifai/config/alignment.py +23 -2
- lattifai/config/caption.py +1 -1
- lattifai/config/media.py +23 -3
- lattifai/errors.py +7 -3
- lattifai/mixin.py +26 -15
- lattifai/server/app.py +2 -1
- lattifai/utils.py +37 -0
- lattifai/workflow/file_manager.py +15 -13
- lattifai/workflow/youtube.py +16 -1
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/METADATA +65 -15
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/RECORD +29 -29
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/licenses/LICENSE +1 -1
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/WHEEL +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/entry_points.txt +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/top_level.txt +0 -0
lattifai/audio2.py
CHANGED
@@ -2,7 +2,7 @@
 
 from collections import namedtuple
 from pathlib import Path
-from typing import BinaryIO,
+from typing import BinaryIO, Optional, Tuple, Union
 
 import numpy as np
 import soundfile as sf
@@ -16,8 +16,14 @@ from lattifai.errors import AudioLoadError
 ChannelSelectorType = Union[int, str]
 
 
-class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "
-    """Audio data container with sampling rate
+class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "streaming_chunk_secs", "overlap_secs"])):
+    """Audio data container with sampling rate and numpy array.
+
+    Supports iteration to stream audio chunks for processing long audio files.
+    The streaming_chunk_secs field indicates whether streaming mode should be used downstream.
+    The overlap_secs field specifies the overlap duration between consecutive chunks.
+    Note: tensor field removed to reduce memory usage. Convert ndarray to tensor on-demand.
+    """
 
     def __str__(self) -> str:
         return self.path
@@ -27,6 +33,66 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "
         """Duration of the audio in seconds."""
         return self.ndarray.shape[-1] / self.sampling_rate
 
+    @property
+    def streaming_mode(self) -> bool:
+        """Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
+        if self.streaming_chunk_secs is not None:
+            return self.duration > self.streaming_chunk_secs * 1.1
+        return False
+
+    def __iter__(self):
+        """Initialize iterator for chunk-based audio streaming.
+
+        Returns an iterator that yields audio chunks as AudioData instances.
+        Uses streaming_chunk_secs and overlap_secs from the instance.
+        """
+        return self.iter_chunks()
+
+    def iter_chunks(
+        self,
+        chunk_secs: Optional[float] = None,
+        overlap_secs: Optional[float] = None,
+    ):
+        """Iterate over audio chunks with configurable duration and overlap.
+
+        Args:
+            chunk_secs: Duration of each chunk in seconds (default: uses streaming_chunk_secs or 600.0).
+            overlap_secs: Overlap between consecutive chunks in seconds (default: uses overlap_secs or 0.0).
+
+        Yields:
+            AudioData: Chunks of audio data.
+
+        Example:
+            >>> audio = loader("long_audio.wav")
+            >>> for chunk in audio.iter_chunks(chunk_secs=60.0, overlap_secs=2.0):
+            ...     process(chunk)
+        """
+        chunk_duration = chunk_secs or self.streaming_chunk_secs or 600.0
+        overlap_duration = overlap_secs or self.overlap_secs or 0.0
+
+        chunk_size = int(chunk_duration * self.sampling_rate)
+        overlap_size = int(overlap_duration * self.sampling_rate)
+        step_size = chunk_size - overlap_size
+        total_samples = self.ndarray.shape[-1]
+
+        current_offset = 0
+        while current_offset < total_samples:
+            start = current_offset
+            end = min(start + chunk_size, total_samples)
+
+            # Extract chunk from ndarray only
+            chunk_ndarray = self.ndarray[..., start:end]
+
+            yield AudioData(
+                sampling_rate=self.sampling_rate,
+                ndarray=chunk_ndarray,
+                path=f"{self.path}[{start/self.sampling_rate:.2f}s-{end/self.sampling_rate:.2f}s]",
+                streaming_chunk_secs=None,
+                overlap_secs=None,
+            )
+
+            current_offset += step_size
+
 
 class AudioLoader:
     """Load and preprocess audio files into AudioData format."""
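Reviewer note: the new iterator advances by chunk_size - overlap_size samples, so consecutive chunks share overlap_secs of audio and the final chunk may be shorter. A minimal sketch of driving it directly, using the AudioData class from this diff with a synthetic signal (the 20-second buffer and chunk sizes are illustrative, not from the package):

import numpy as np

# Synthetic 20 s of silence at 16 kHz, standing in for loader output.
sr = 16000
audio = AudioData(
    sampling_rate=sr,
    ndarray=np.zeros((1, sr * 20), dtype=np.float32),
    path="example.wav",
    streaming_chunk_secs=600.0,
    overlap_secs=0.0,
)

# 6 s chunks with 1 s overlap: starts advance by 5 s -> 0-6, 5-11, 10-16, 15-20.
for chunk in audio.iter_chunks(chunk_secs=6.0, overlap_secs=1.0):
    print(chunk.path, round(chunk.duration, 2))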
@@ -45,62 +111,48 @@ class AudioLoader:
 
     def _resample_audio(
         self,
-        audio_sr: Tuple[
+        audio_sr: Tuple[np.ndarray, int],
         sampling_rate: int,
         device: Optional[str],
         channel_selector: Optional[ChannelSelectorType],
-    ) ->
+    ) -> np.ndarray:
         """Resample audio to target sampling rate with channel selection.
 
         Args:
-            audio_sr: Tuple of (
+            audio_sr: Tuple of (audio, original_sample_rate).
             sampling_rate: Target sampling rate.
             device: Device to perform resampling on.
             channel_selector: How to select channels.
 
         Returns:
-            Resampled audio
+            Resampled audio array of shape (1, T) or (C, T).
         """
         audio, sr = audio_sr
 
         if channel_selector is None:
             # keep the original multi-channel signal
-            tensor = audio
+            tensor = audio.T
+            del audio  # Free original audio memory
         elif isinstance(channel_selector, int):
-            assert audio.shape[
-            tensor = audio[channel_selector : channel_selector + 1].
+            assert audio.shape[1] >= channel_selector, f"Invalid channel: {channel_selector}"
+            tensor = audio[:, channel_selector : channel_selector + 1].T.copy()
             del audio
         elif isinstance(channel_selector, str):
             assert channel_selector == "average"
-            tensor =
+            tensor = np.mean(audio, axis=1, keepdims=True).T
             del audio
         else:
             raise ValueError(f"Unsupported channel_selector: {channel_selector}")
-
-
-        # print(f"Selecting channels {channel_selector} from the signal with {num_channels} channels.")
-        # if max(channel_selector) >= num_channels:
-        #     raise ValueError(
-        #         f"Cannot select channel subset {channel_selector} from a signal with {num_channels} channels."
-        #     )
-        #     tensor = audio[channel_selector]
-
-        tensor = tensor.to(device)
+
+        # tensor: np.ndarray (channels, samples)
         if sr != sampling_rate:
             cache_key = (sr, sampling_rate, device)
             if cache_key not in self._resampler_cache:
                 self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
             resampler = self._resampler_cache[cache_key]
 
-
-            if length > chunk_size:
-                resampled_chunks = []
-                for i in range(0, length, chunk_size):
-                    resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
-                tensor = torch.cat(resampled_chunks, dim=-1)
-            else:
-                tensor = resampler(tensor)
+            tensor = resampler(torch.from_numpy(tensor).to(device=device))
+            tensor = tensor.cpu().numpy()
 
         return tensor
 
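Reviewer note: _resample_audio now keeps data in NumPy end to end and only round-trips through torch for the actual resampling. The package's get_or_create_resampler is internal; below is a minimal standalone sketch of the same cache-per-(rate, device) pattern, assuming torchaudio's Resample transform as the backend (an assumption, not the package's own helper):

import torch
import torchaudio

_resampler_cache = {}

def resample(ndarray, sr, target_sr, device="cpu"):
    """Resample a float32 (channels, samples) array, reusing one Resample module per key."""
    if sr == target_sr:
        return ndarray
    key = (sr, target_sr, device)
    if key not in _resampler_cache:
        _resampler_cache[key] = torchaudio.transforms.Resample(sr, target_sr).to(device)
    tensor = torch.from_numpy(ndarray).to(device)
    return _resampler_cache[key](tensor).cpu().numpy()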
@@ -109,7 +161,7 @@ class AudioLoader:
         audio: Union[Pathlike, BinaryIO],
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
-    ) ->
+    ) -> np.ndarray:
         """Load audio from file or binary stream and resample to target rate.
 
         Args:
@@ -118,7 +170,7 @@ class AudioLoader:
         channel_selector: How to select channels.
 
         Returns:
-            Resampled audio
+            Resampled audio as a NumPy array of shape (channels, samples).
 
         Raises:
             ImportError: If PyAV is needed but not installed.
@@ -128,11 +180,69 @@ class AudioLoader:
         if isinstance(audio, Pathlike):
             audio = str(Path(str(audio)).expanduser())
 
-        # load audio
+        # load audio in chunks to reduce memory footprint for long files
         try:
-
-
+            # First check file duration to decide loading strategy
+            info = sf.info(audio)
+            duration = info.duration
+
+            # For very long audio (>60 minutes), use chunk-based loading
+            if duration > 3600:  # 60 minutes
+                with sf.SoundFile(audio, "r") as f:
+                    sample_rate = f.samplerate
+                    total_frames = f.frames
+
+                    # Pre-calculate output size to avoid list accumulation
+                    num_channels = 1 if channel_selector else f.channels
+                    expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+
+                    # Pre-allocate output array
+                    waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
+
+                    # Use source sample rate for reading, not target
+                    chunk_frames = int(sample_rate * 1800)  # 30-minute chunks at source rate
+                    output_offset = 0
+
+                    while True:
+                        chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                        if chunk.size == 0:
+                            break
+
+                        # Resample chunk -> (channels, samples)
+                        resampled_chunk = self._resample_audio(
+                            (chunk, sample_rate),
+                            sampling_rate,
+                            device=self.device,
+                            channel_selector=channel_selector,
+                        )
+
+                        # Write directly to pre-allocated array
+                        chunk_length = resampled_chunk.shape[-1]
+                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                        output_offset += chunk_length
+
+                        # Clean up immediately
+                        del chunk, resampled_chunk
+
+                    # Trim to actual size if needed (due to rounding in resampling)
+                    if output_offset < expected_output_samples:
+                        waveform = waveform[..., :output_offset]
+
+                    return waveform
+            else:
+                # For shorter audio, use standard loading
+                waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+                # Resample and return directly to avoid double processing
+                result = self._resample_audio(
+                    (waveform, sample_rate),
+                    sampling_rate,
+                    device=self.device,
+                    channel_selector=channel_selector,
+                )
+                del waveform
+                return result
         except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
             # Fallback to PyAV for formats not supported by soundfile
             try:
                 import av
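Reviewer note: the soundfile path now probes the header first and only decodes whole files that are an hour long or shorter. A minimal standalone sketch of the same probe-then-stream pattern (file name, threshold, and block size are illustrative):

import soundfile as sf

def read_in_blocks(path, block_secs=1800.0):
    """Yield float32 (samples, channels) blocks without decoding the whole file at once."""
    if sf.info(path).duration <= 3600:  # header-only probe, no audio decode
        yield sf.read(path, always_2d=True, dtype="float32")[0]
        return
    with sf.SoundFile(path, "r") as f:
        frames_per_block = int(f.samplerate * block_secs)
        while True:
            block = f.read(frames=frames_per_block, dtype="float32", always_2d=True)
            if block.size == 0:
                break
            yield block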
@@ -150,62 +260,151 @@ class AudioLoader:
                 if audio_stream is None:
                     raise ValueError(f"No audio stream found in file: {audio}")
 
-                # Resample to target sample rate during decoding
                 audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-
-                frames = []
-                for frame in container.decode(audio_stream):
-                    # Convert frame to numpy array
-                    array = frame.to_ndarray()
-                    # Ensure shape is (channels, samples)
-                    if array.ndim == 1:
-                        array = array.reshape(1, -1)
-                    elif array.ndim == 2 and array.shape[0] > array.shape[1]:
-                        array = array.T
-                    frames.append(array)
-
-                container.close()
-
-                if not frames:
-                    raise ValueError(f"No audio data found in file: {audio}")
-
-                # Concatenate all frames
-                waveform = np.concatenate(frames, axis=1).astype(np.float32)  # (channels, samples)
                 sample_rate = audio_stream.codec_context.sample_rate
+
+                # Estimate duration to decide processing strategy
+                duration_estimate = None
+                if audio_stream.duration and audio_stream.time_base:
+                    duration_estimate = float(audio_stream.duration * audio_stream.time_base)
+                else:
+                    print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+                # For very long audio (>30 minutes), process and resample in chunks
+                if duration_estimate and duration_estimate > 1800:
+                    # Estimate output size and pre-allocate with buffer
+                    num_channels = 1 if channel_selector else audio_stream.codec_context.channels
+                    estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
+                    waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                    frames = []
+                    accumulated_samples = 0
+                    output_offset = 0
+                    chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
+
+                    for frame in container.decode(audio_stream):
+                        array = frame.to_ndarray()
+
+                        # Ensure shape is (samples, channels)
+                        if array.ndim == 1:
+                            array = array.reshape(-1, 1)
+                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                            array = array.T
+
+                        frames.append(array)
+                        accumulated_samples += array.shape[0]
+
+                        # Process chunk when accumulated enough samples
+                        if accumulated_samples >= chunk_sample_target:
+                            chunk = np.concatenate(frames, axis=0).astype(np.float32)
+                            del frames  # Free frames list before resampling
+                            # Resample chunk -> (channels, samples)
+                            resampled_chunk = self._resample_audio(
+                                (chunk, sample_rate),
+                                sampling_rate,
+                                device=self.device,
+                                channel_selector=channel_selector,
+                            )
+
+                            chunk_length = resampled_chunk.shape[-1]
+                            if output_offset + chunk_length > waveform.shape[-1]:
+                                print(
+                                    f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
+                                )
+                                resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                            # Write directly to array
+                            waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                            output_offset += chunk_length
+
+                            # Clean up immediately
+                            del chunk, resampled_chunk
+                            frames = []  # Create new list
+                            accumulated_samples = 0
+
+                    # Process remaining frames
+                    if frames:
+                        chunk = np.concatenate(frames, axis=0).astype(np.float32)
+                        del frames
+                        resampled_chunk = self._resample_audio(
+                            (chunk, sample_rate),
+                            sampling_rate,
+                            device=self.device,
+                            channel_selector=channel_selector,
+                        )
+
+                        chunk_length = resampled_chunk.shape[-1]
+                        if output_offset + chunk_length > waveform.shape[-1]:
+                            print(
+                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
+                            )
+                            resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                        output_offset += chunk_length
+                        del chunk, resampled_chunk
+
+                    container.close()
+
+                    if output_offset == 0:
+                        raise ValueError(f"No audio data found in file: {audio}")
+
+                    # Trim to actual size
+                    waveform = waveform[..., :output_offset]
+                    return waveform
+                else:
+                    # For shorter audio, process in batches to reduce memory
+                    frames = []
+                    for frame in container.decode(audio_stream):
+                        array = frame.to_ndarray()
+                        # Ensure shape is (channels, samples)
+                        if array.ndim == 1:
+                            array = array.reshape(-1, 1)
+                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                            array = array.T
+                        frames.append(array)
+                    container.close()
+
+                    if not frames:
+                        raise ValueError(f"No audio data found in file: {audio}")
+
+                    # Concatenate remaining frames
+                    waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                    del frames
+                    # Resample and return directly
+                    result = self._resample_audio(
+                        (waveform, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )
+                    del waveform
+                    return result
             except Exception as e:
                 raise RuntimeError(f"Failed to load audio file {audio}: {e}")
 
-        return self._resample_audio(
-            (torch.from_numpy(waveform), sample_rate),
-            sampling_rate,
-            device=self.device,
-            channel_selector=channel_selector,
-        )
-
     def __call__(
         self,
         audio: Union[Pathlike, BinaryIO],
         sampling_rate: int = 16000,
         channel_selector: Optional[ChannelSelectorType] = "average",
+        streaming_chunk_secs: Optional[float] = None,
     ) -> AudioData:
         """
         Args:
             audio: Path to audio file or binary stream.
             channel_selector: How to select channels (default: "average").
             sampling_rate: Target sampling rate (default: use instance sampling_rate).
+            streaming_chunk_secs: Duration in seconds for streaming chunks (default: None, disabled).
 
         Returns:
-            AudioData namedtuple with sampling_rate, ndarray, and
+            AudioData namedtuple with sampling_rate, ndarray, and streaming_chunk_secs fields.
         """
-
-
-        # tensor is (1, T) or (C, T)
-        ndarray = tensor.cpu().numpy()
-
+        ndarray = self._load_audio(audio, sampling_rate, channel_selector)
         return AudioData(
             sampling_rate=sampling_rate,
             ndarray=ndarray,
-            tensor=tensor,
-            device=self.device,
             path=str(audio) if isinstance(audio, Pathlike) else "<BinaryIO>",
+            streaming_chunk_secs=streaming_chunk_secs,
+            overlap_secs=0.0,
        )
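Reviewer note: taken together, the audio2.py changes drop the tensor/device fields from AudioData and thread streaming_chunk_secs from __call__ through to the chunk iterator. A hypothetical end-to-end sketch (AudioLoader's constructor arguments and the run_alignment consumer are assumptions; only the __call__ signature above is from this diff):

loader = AudioLoader()  # constructor args assumed
audio = loader("podcast.wav", sampling_rate=16000, streaming_chunk_secs=600.0)

if audio.streaming_mode:      # True only when duration > 1.1 * streaming_chunk_secs
    for chunk in audio:       # __iter__ delegates to iter_chunks()
        run_alignment(chunk)  # hypothetical downstream consumer
else:
    run_alignment(audio)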
lattifai/caption/caption.py
CHANGED
@@ -307,7 +307,7 @@ class Caption:
         cls,
         path: Pathlike,
         format: Optional[str] = None,
-        normalize_text: bool =
+        normalize_text: bool = True,
     ) -> "Caption":
         """
         Read caption file and return Caption object.
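Reviewer note: text normalization is now on by default when reading captions, so callers that need the raw text must opt out explicitly. A sketch, assuming this classmethod is Caption.read (the method name is not visible in the hunk):

caption = Caption.read("episode.srt")                    # normalized by default
raw = Caption.read("episode.srt", normalize_text=False)  # opt out explicitly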
@@ -505,6 +505,8 @@ class Caption:
             cls._write_csv(alignments, output_path, include_speaker_in_text)
         elif str(output_path)[-4:].lower() == ".aud":
             cls._write_aud(alignments, output_path, include_speaker_in_text)
+        elif str(output_path)[-4:].lower() == ".sbv":
+            cls._write_sbv(alignments, output_path, include_speaker_in_text)
         else:
             import pysubs2
 
@@ -535,7 +537,14 @@ class Caption:
                     name=sup.speaker or "",
                 )
             )
-
+
+        # MicroDVD format requires framerate to be specified
+        output_ext = str(output_path).lower().split(".")[-1]
+        if output_ext == "sub":
+            # Default to 25 fps for MicroDVD format if not specified
+            subs.save(output_path, fps=25.0)
+        else:
+            subs.save(output_path)
 
         return output_path
 
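Reviewer note: MicroDVD (.sub) is frame-based, so pysubs2 needs a framerate at save time, while time-based formats do not. A minimal pysubs2 illustration of the rule encoded above (pysubs2 is the dependency imported in this file; 25 fps matches the default chosen here):

import pysubs2

subs = pysubs2.SSAFile()
subs.append(pysubs2.SSAEvent(start=0, end=2000, text="Hello"))  # times in ms
subs.save("out.sub", fps=25.0)  # MicroDVD: frame numbers need a framerate
subs.save("out.srt")            # SRT and friends: timestamps, no fps needed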
@@ -821,7 +830,7 @@ class Caption:
         if cls._is_youtube_vtt_with_word_timestamps(content):
             return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
 
-        if format == "gemini" or str(caption).endswith("Gemini.md"):
+        if format == "gemini" or str(caption).endswith("Gemini.md") or str(caption).endswith("Gemini3.md"):
             from .gemini_reader import GeminiReader
 
             supervisions = GeminiReader.extract_for_alignment(caption)
@@ -850,6 +859,8 @@ class Caption:
             supervisions = cls._parse_csv(caption, normalize_text)
         elif format == "aud" or str(caption)[-4:].lower() == ".aud":
             supervisions = cls._parse_aud(caption, normalize_text)
+        elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
+            supervisions = cls._parse_sbv(caption, normalize_text)
         elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
             if not Path(str(caption)).exists():  # str
                 lines = [line.strip() for line in str(caption).split("\n")]
@@ -1113,6 +1124,101 @@ class Caption:
 
         return supervisions
 
+    @classmethod
+    def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
+        """
+        Parse SubViewer (SBV) format caption file.
+
+        Format:
+            0:00:00.000,0:00:02.000
+            Text line 1
+
+            0:00:02.000,0:00:04.000
+            Text line 2
+
+        Args:
+            caption: Caption file path
+            normalize_text: Whether to normalize text
+
+        Returns:
+            List of Supervision objects
+        """
+        caption_path = Path(str(caption))
+        if not caption_path.exists():
+            raise FileNotFoundError(f"Caption file not found: {caption}")
+
+        supervisions = []
+
+        with open(caption_path, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Split by double newlines to separate entries
+        entries = content.strip().split("\n\n")
+
+        for entry in entries:
+            lines = entry.strip().split("\n")
+            if len(lines) < 2:
+                continue
+
+            # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
+            timestamp_line = lines[0].strip()
+            # Remaining lines: text
+            text_lines = lines[1:]
+
+            try:
+                # Parse timestamp: 0:00:00.000,0:00:02.000
+                if "," not in timestamp_line:
+                    continue
+
+                start_str, end_str = timestamp_line.split(",", 1)
+
+                # Parse start time
+                start_parts = start_str.strip().split(":")
+                if len(start_parts) == 3:
+                    h, m, s = start_parts
+                    s_parts = s.split(".")
+                    start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
+                    if len(s_parts) > 1:
+                        start += int(s_parts[1]) / 1000.0
+                else:
+                    continue
+
+                # Parse end time
+                end_parts = end_str.strip().split(":")
+                if len(end_parts) == 3:
+                    h, m, s = end_parts
+                    s_parts = s.split(".")
+                    end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
+                    if len(s_parts) > 1:
+                        end += int(s_parts[1]) / 1000.0
+                else:
+                    continue
+
+                # Parse text and speaker
+                text = " ".join(text_lines).strip()
+                speaker, text = parse_speaker_text(text)
+
+                if normalize_text:
+                    text = normalize_text_fn(text)
+
+                duration = end - start
+                if duration < 0:
+                    continue
+
+                supervisions.append(
+                    Supervision(
+                        text=text,
+                        start=start,
+                        duration=duration,
+                        speaker=speaker,
+                    )
+                )
+            except (ValueError, IndexError):
+                # Skip malformed entries
+                continue
+
+        return supervisions
+
     @classmethod
     def _write_tsv(
         cls,
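Reviewer note: a worked instance of the timestamp arithmetic in _parse_sbv, for the start stamp "0:01:02.500" (the fractional part is divided by 1000, so the parser assumes the SBV-standard three millisecond digits):

h, m, s = "0:01:02.500".split(":")
sec, ms = s.split(".")
start = int(h) * 3600 + int(m) * 60 + int(sec) + int(ms) / 1000.0
assert start == 62.5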
@@ -1217,6 +1323,58 @@ class Caption:
 
                 file.write(f"{start}\t{end}\t{text}\n")
 
+    @classmethod
+    def _write_sbv(
+        cls,
+        alignments: List[Supervision],
+        output_path: Pathlike,
+        include_speaker_in_text: bool = True,
+    ) -> None:
+        """
+        Write caption to SubViewer (SBV) format.
+
+        Format:
+            0:00:00.000,0:00:02.000
+            Text line 1
+
+            0:00:02.000,0:00:04.000
+            Text line 2
+
+        Args:
+            alignments: List of supervision segments to write
+            output_path: Path to output SBV file
+            include_speaker_in_text: Whether to include speaker in text
+        """
+        with open(output_path, "w", encoding="utf-8") as file:
+            for i, supervision in enumerate(alignments):
+                # Format timestamps as H:MM:SS.mmm
+                start_h = int(supervision.start // 3600)
+                start_m = int((supervision.start % 3600) // 60)
+                start_s = int(supervision.start % 60)
+                start_ms = int((supervision.start % 1) * 1000)
+
+                end_h = int(supervision.end // 3600)
+                end_m = int((supervision.end % 3600) // 60)
+                end_s = int(supervision.end % 60)
+                end_ms = int((supervision.end % 1) * 1000)
+
+                start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
+                end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
+
+                # Write timestamp line
+                file.write(f"{start_time},{end_time}\n")
+
+                # Write text (with optional speaker)
+                text = supervision.text.strip()
+                if include_speaker_in_text and supervision.speaker:
+                    text = f"{supervision.speaker}: {text}"
+
+                file.write(f"{text}\n")
+
+                # Add blank line between entries (except after last one)
+                if i < len(alignments) - 1:
+                    file.write("\n")
+
     @classmethod
     def _parse_caption(
         cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
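Reviewer note: _write_sbv truncates rather than rounds the millisecond field (int((t % 1) * 1000)), which can shave a millisecond off some floats. A worked instance of the H:MM:SS.mmm math above for t = 3723.5 seconds:

t = 3723.5
h, m = int(t // 3600), int((t % 3600) // 60)
s, ms = int(t % 60), int((t % 1) * 1000)
assert f"{h}:{m:02d}:{s:02d}.{ms:03d}" == "1:02:03.500"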
lattifai/cli/alignment.py
CHANGED
@@ -81,7 +81,7 @@ def align(
             caption.word_level=true \\
             caption.normalize_text=true \\
             alignment.device=mps \\
-            alignment.model_name=
+            alignment.model_name=LattifAI/Lattice-1-Alpha
     """
     media_config = media or MediaConfig()
 
@@ -142,6 +142,7 @@ def align(
         output_caption_path=caption_config.output_path,
         split_sentence=caption_config.split_sentence,
        channel_selector=media_config.channel_selector,
+        streaming_chunk_secs=media_config.streaming_chunk_secs,
     )
 
 
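Reviewer note: the CLI now forwards media.streaming_chunk_secs into the alignment call, wiring the new MediaConfig field (lattifai/config/media.py also changed in this release) through to the audio loader. A hypothetical programmatic equivalent (MediaConfig's other fields and the full align() signature are assumptions):

from lattifai.config.media import MediaConfig

media = MediaConfig(streaming_chunk_secs=600.0)  # stream long files in 10-minute chunks
# align(..., media=media) then passes media.streaming_chunk_secs to the loader,
# which returns an AudioData that iterates in chunks when streaming_mode is True.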