lattifai 1.2.2-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
- lattifai-1.3.1.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
lattifai/audio2.py
CHANGED

@@ -162,168 +162,132 @@ class AudioLoader:
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
     ) -> np.ndarray:
-        """Load audio from file or binary stream and resample to target rate.
+        """Load audio from file or binary stream and resample to target rate."""
+        audio_source: Union[str, BinaryIO] = audio
+        audio_path: Optional[Path] = None

-        Args:
-            audio: Path to audio file or binary stream.
-            sampling_rate: Target sampling rate.
-            channel_selector: How to select channels.
-
-        Returns:
-            Resampled audio as a NumPy array of shape (channels, samples).
-
-        Raises:
-            ImportError: If PyAV is needed but not installed.
-            ValueError: If no audio stream found.
-            RuntimeError: If audio loading fails.
-        """
         if isinstance(audio, Pathlike):
+            audio_path = Path(str(audio)).expanduser()
+            audio_source = str(audio_path)
+
+        if audio_path and audio_path.suffix.lower() in [".mp4", ".m4a", ".aac", ".mov", ".webm", ".avi", ".mkv"]:
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector)

-        # load audio in chunks to reduce memory footprint for long files
         try:
+            return self._load_audio_with_soundfile(audio_source, sampling_rate, channel_selector)
+        except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector, primary_error)

+    def _load_audio_with_soundfile(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+    ) -> np.ndarray:
+        """Load audio via soundfile with chunking support for long inputs."""
+        info = sf.info(audio)
+        duration = info.duration

+        if duration > 3600:
+            with sf.SoundFile(audio, "r") as f:
+                sample_rate = f.samplerate
+                total_frames = f.frames

+                num_channels = 1 if channel_selector else f.channels
+                expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+                waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)

-            output_offset = 0
+                chunk_frames = int(sample_rate * 1800)
+                output_offset = 0

+                while True:
+                    chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                    if chunk.size == 0:
+                        break

-            )
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )

-            output_offset += chunk_length
+                    chunk_length = resampled_chunk.shape[-1]
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length

+                    del chunk, resampled_chunk
+
+            if output_offset < expected_output_samples:
+                waveform = waveform[..., :output_offset]

-            if output_offset < expected_output_samples:
-                waveform = waveform[..., :output_offset]
+            return waveform

+        waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+        result = self._resample_audio(
+            (waveform, sample_rate),
+            sampling_rate,
+            device=self.device,
+            channel_selector=channel_selector,
+        )
+        del waveform
+        return result
+
+    def _load_audio_with_av(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+        primary_error: Optional[Exception] = None,
+    ) -> np.ndarray:
+        """Load audio via PyAV when soundfile is unavailable or unsuitable."""
+        try:
+            import av
+        except ImportError as exc:  # pragma: no cover
+            message = "PyAV (av) is required for loading certain audio formats. Install it with: pip install av"
+            if primary_error:
+                message = f"{message}\nPrimary error was: {primary_error}"
+            raise AudioLoadError(message) from exc
+
+        try:
+            container = av.open(audio)
+            audio_stream = next((s for s in container.streams if s.type == "audio"), None)
+
+            if audio_stream is None:
+                raise ValueError(f"No audio stream found in file: {audio}")
+
+            audio_stream.codec_context.format = av.AudioFormat("flt")
+            sample_rate = audio_stream.codec_context.sample_rate
+
+            duration_estimate = None
+            if audio_stream.duration and audio_stream.time_base:
+                duration_estimate = float(audio_stream.duration * audio_stream.time_base)
             else:
-            container = av.open(audio)
-            audio_stream = next((s for s in container.streams if s.type == "audio"), None)
-
-            if audio_stream is None:
-                raise ValueError(f"No audio stream found in file: {audio}")
-
-            audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-            sample_rate = audio_stream.codec_context.sample_rate
-
-            # Estimate duration to decide processing strategy
-            duration_estimate = None
-            if audio_stream.duration and audio_stream.time_base:
-                duration_estimate = float(audio_stream.duration * audio_stream.time_base)
-            else:
-                print(f"WARNING: Failed to estimate duration for audio: {audio}")
-
-            # For very long audio (>30 minutes), process and resample in chunks
-            if duration_estimate and duration_estimate > 1800:
-                # Estimate output size and pre-allocate with buffer
-                num_channels = 1 if channel_selector else audio_stream.codec_context.channels
-                estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
-                waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
-
-                frames = []
-                accumulated_samples = 0
-                output_offset = 0
-                chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
-
-                for frame in container.decode(audio_stream):
-                    array = frame.to_ndarray()
-
-                    # Ensure shape is (samples, channels)
-                    if array.ndim == 1:
-                        array = array.reshape(-1, 1)
-                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                        array = array.T
-
-                    frames.append(array)
-                    accumulated_samples += array.shape[0]
-
-                    # Process chunk when accumulated enough samples
-                    if accumulated_samples >= chunk_sample_target:
+                print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+            if duration_estimate and duration_estimate > 1800:
+                num_channels = 1 if channel_selector else audio_stream.codec_context.channels
+                estimated_samples = int(duration_estimate * sampling_rate * 1.1)
+                waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                frames = []
+                accumulated_samples = 0
+                output_offset = 0
+                chunk_sample_target = int(sample_rate * 600)
+
+                for frame in container.decode(audio_stream):
+                    array = frame.to_ndarray()
+
+                    if array.ndim == 1:
+                        array = array.reshape(-1, 1)
+                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                        array = array.T
+
+                    frames.append(array)
+                    accumulated_samples += array.shape[0]
+
+                    if accumulated_samples >= chunk_sample_target:
                         chunk = np.concatenate(frames, axis=0).astype(np.float32)
                         del frames
                         resampled_chunk = self._resample_audio(

@@ -335,53 +299,68 @@ class AudioLoader:

                         chunk_length = resampled_chunk.shape[-1]
                         if output_offset + chunk_length > waveform.shape[-1]:
-                            print(
-                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
-                            )
+                            print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
                             resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]

                         waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
                         output_offset += chunk_length
-                        del chunk, resampled_chunk
-
-                container.close()

+                        del chunk, resampled_chunk
+                        frames = []
+                        accumulated_samples = 0

-                return waveform
-            else:
-                # For shorter audio, process in batches to reduce memory
-                frames = []
-                for frame in container.decode(audio_stream):
-                    array = frame.to_ndarray()
-                    # Ensure shape is (channels, samples)
-                    if array.ndim == 1:
-                        array = array.reshape(-1, 1)
-                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                        array = array.T
-                    frames.append(array)
-                container.close()
-
-                if not frames:
-                    raise ValueError(f"No audio data found in file: {audio}")
-
-                # Concatenate remaining frames
-                waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                if frames:
+                    chunk = np.concatenate(frames, axis=0).astype(np.float32)
                     del frames
-                    (waveform, sample_rate),
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
                         sampling_rate,
                         device=self.device,
                         channel_selector=channel_selector,
                     )
+
+                    chunk_length = resampled_chunk.shape[-1]
+                    if output_offset + chunk_length > waveform.shape[-1]:
+                        print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
+                        resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length
+                    del chunk, resampled_chunk
+
+                container.close()
+
+                if output_offset == 0:
+                    raise ValueError(f"No audio data found in file: {audio}")
+
+                waveform = waveform[..., :output_offset]
+                return waveform
+
+            frames = []
+            for frame in container.decode(audio_stream):
+                array = frame.to_ndarray()
+                if array.ndim == 1:
+                    array = array.reshape(-1, 1)
+                elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                    array = array.T
+                frames.append(array)
+            container.close()
+
+            if not frames:
+                raise ValueError(f"No audio data found in file: {audio}")
+
+            waveform = np.concatenate(frames, axis=0).astype(np.float32)
+            del frames
+            result = self._resample_audio(
+                (waveform, sample_rate),
+                sampling_rate,
+                device=self.device,
+                channel_selector=channel_selector,
+            )
+            del waveform
+            return result
+        except Exception as exc:
+            raise RuntimeError(f"Failed to load audio file {audio}: {exc}")

     def __call__(
         self,
lattifai/cli/alignment.py
CHANGED

@@ -1,5 +1,6 @@
 """Alignment CLI entry point with nemo_run."""

+import sys
 from typing import Optional

 import nemo_run as run

@@ -12,9 +13,11 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )
+from lattifai.errors import LattifAIError

 __all__ = ["align"]

@@ -30,6 +33,7 @@ def align(
     alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
 ):
     """
     Align audio/video with caption file.

@@ -121,6 +125,7 @@ def align(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )

     is_url = media_config.input_path.startswith(("http://", "https://"))
lattifai/cli/caption.py
CHANGED

@@ -6,8 +6,8 @@ import nemo_run as run
 from lhotse.utils import Pathlike
 from typing_extensions import Annotated

+from lattifai.caption.config import KaraokeConfig
 from lattifai.config import CaptionConfig
-from lattifai.config.caption import KaraokeConfig
 from lattifai.utils import safe_print


@@ -68,7 +68,7 @@ def convert(
             input_path=input.srt \\
             output_path=output.TextGrid
     """
-    from lattifai.
+    from lattifai.data import Caption

     # Create karaoke_config if karaoke flag is set
     karaoke_config = KaraokeConfig(enabled=True) if karaoke else None

@@ -121,7 +121,7 @@ def normalize(
     """
    from pathlib import Path

-    from lattifai.
+    from lattifai.data import Caption

     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()

@@ -176,7 +176,7 @@ def shift(
     """
     from pathlib import Path

-    from lattifai.
+    from lattifai.data import Caption

     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()

@@ -235,9 +235,9 @@ def diff(
     """
     from pathlib import Path

-    from lattifai.alignment.sentence_splitter import SentenceSplitter
     from lattifai.alignment.text_align import align_supervisions_and_transcription
-    from lattifai.caption import
+    from lattifai.caption import SentenceSplitter
+    from lattifai.data import Caption

     ref_path = Path(ref_path).expanduser()
     hyp_path = Path(hyp_path).expanduser()
lattifai/cli/transcribe.py
CHANGED

@@ -92,10 +92,6 @@ def transcribe(
     client_wrapper = SyncAPIClient(config=client_config)
     transcription_config.client_wrapper = client_wrapper

-    # Initialize client wrapper to properly set client_wrapper
-    client_wrapper = SyncAPIClient(config=client_config)
-    transcription_config.client_wrapper = client_wrapper
-
     # Validate input is required
     if not input and not media_config.input_path:
         raise ValueError("Input is required. Provide input as positional argument or media.input_path.")

@@ -170,7 +166,7 @@ def transcribe(
     safe_print(colorful.cyan(f" Output: {final_output}"))

     # Write output
-    transcriber.write(transcript, final_output, encoding="utf-8",
+    transcriber.write(transcript, final_output, encoding="utf-8", cache_event=False)

     safe_print(colorful.green(f"🎉 Transcription completed: {final_output}"))
lattifai/cli/youtube.py
CHANGED

@@ -11,6 +11,7 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )

@@ -25,6 +26,7 @@ def youtube(
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
     use_transcription: bool = False,
 ):
     """

@@ -114,6 +116,7 @@ def youtube(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )

     # Call the client's youtube method
lattifai/client.py
CHANGED

@@ -7,10 +7,18 @@ import colorful
 from lattifai_core.client import SyncAPIClient
 from lhotse.utils import Pathlike

-from lattifai.alignment import Lattice1Aligner, Segmenter
+from lattifai.alignment import Lattice1Aligner, Segmenter
 from lattifai.audio2 import AudioData, AudioLoader
-from lattifai.caption import
-from lattifai.config import
+from lattifai.caption import InputCaptionFormat
+from lattifai.config import (
+    AlignmentConfig,
+    CaptionConfig,
+    ClientConfig,
+    DiarizationConfig,
+    EventConfig,
+    TranscriptionConfig,
+)
+from lattifai.data import Caption
 from lattifai.errors import (
     AlignmentError,
     CaptionProcessingError,

@@ -22,6 +30,7 @@ from lattifai.utils import safe_print

 if TYPE_CHECKING:
     from lattifai.diarization import LattifAIDiarizer  # noqa: F401
+    from lattifai.event import LattifAIEventDetector  # noqa: F401


 class LattifAI(LattifAIClientMixin, SyncAPIClient):

@@ -41,6 +50,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         caption_config: Optional[CaptionConfig] = None,
         transcription_config: Optional[TranscriptionConfig] = None,
         diarization_config: Optional[DiarizationConfig] = None,
+        event_config: Optional[EventConfig] = None,
     ) -> None:
         __doc__ = LattifAIClientMixin._INIT_DOC.format(
             client_class="LattifAI",

@@ -59,8 +69,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         self.config = client_config

         # Initialize all configs with defaults
-        alignment_config, transcription_config, diarization_config = self._init_configs(
-            alignment_config, transcription_config, diarization_config
+        alignment_config, transcription_config, diarization_config, event_config = self._init_configs(
+            alignment_config, transcription_config, diarization_config, event_config
         )

         # Store configs

@@ -82,6 +92,14 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

         self.diarizer = LattifAIDiarizer(config=self.diarization_config)

+        # Initialize event detector if enabled
+        self.event_config = event_config
+        self.event_detector: Optional["LattifAIEventDetector"] = None
+        if self.event_config.enabled:
+            from lattifai.event import LattifAIEventDetector  # noqa: F811
+
+            self.event_detector = LattifAIEventDetector(config=self.event_config)
+
         # Initialize shared components (transcriber, downloader)
         self._init_shared_components(transcription_config)

@@ -126,6 +144,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))

         if caption.supervisions and alignment_strategy == "transcription":
+            from lattifai.alignment.text_align import align_supervisions_and_transcription
+
             if "gemini" in self.transcriber.name.lower():
                 raise ValueError(
                     f"Transcription-based alignment is not supported for {self.transcriber.name} "

@@ -139,7 +159,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
             )
             caption.transcription = transcript.supervisions or transcript.transcription
-            caption.
+            caption.event = transcript.event
             if not caption.transcription:
                 raise ValueError("Transcription is empty after transcription step.")

@@ -234,13 +254,15 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             if self.config.profile:
                 self.aligner.profile()

-        except (CaptionProcessingError, LatticeEncodingError
+        except (CaptionProcessingError, LatticeEncodingError) as e:
             # Re-raise our specific errors as-is
-            raise
+            raise e
+        except LatticeDecodingError as e:
+            raise e
         except Exception as e:
             # Catch any unexpected errors and wrap them
             raise AlignmentError(
-                "Unexpected error during alignment process",
+                message="Unexpected error during alignment process",
                 media_path=str(input_media),
                 caption_path=str(input_caption),
                 context={"original_error": str(e), "error_type": e.__class__.__name__},

@@ -255,6 +277,13 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 output_caption_path=output_caption_path,
             )

+        # Step 6: Event detection
+        if self.event_config.enabled and self.event_detector:
+            safe_print(colorful.cyan("🔊 Performing audio event detection..."))
+            caption = self.event_detector.detect_and_update_caption(caption, media_audio)
+            if output_caption_path:
+                self._write_caption(caption, output_caption_path)
+
         return caption

     def speaker_diarization(

@@ -285,12 +314,12 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
         if diarization_file.exists():
             safe_print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
-            caption.
+            caption.read_diarization(diarization_file)

         diarization, alignments = self.diarizer.diarize_with_alignments(
             input_media,
             caption.alignments,
-            diarization=caption.
+            diarization=caption.diarization,
             alignment_fn=self.aligner.alignment,
             transcribe_fn=self.transcriber.transcribe_numpy if self.transcriber else None,
             separate_fn=self.aligner.separate if self.aligner.worker.separator_ort else None,

@@ -298,7 +327,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             output_path=output_caption_path,
         )
         caption.alignments = alignments
-        caption.
+        caption.diarization = diarization

         # Write output if requested
         if output_caption_path: