lattifai-1.2.1-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
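Before digging into the per-file diffs below, a quick way to confirm which wheel is actually active in an environment (standard library only; nothing here beyond the package name is taken from this diff):

```python
import importlib.metadata

# Prints the installed lattifai version; expect "1.3.0" once the new wheel is in place.
print(importlib.metadata.version("lattifai"))
```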
lattifai/audio2.py
CHANGED

@@ -162,168 +162,132 @@ class AudioLoader:
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
     ) -> np.ndarray:
-        """Load audio from file or binary stream and resample to target rate.
+        """Load audio from file or binary stream and resample to target rate."""
+        audio_source: Union[str, BinaryIO] = audio
+        audio_path: Optional[Path] = None
 
-        Args:
-            audio: Path to audio file or binary stream.
-            sampling_rate: Target sampling rate.
-            channel_selector: How to select channels.
-
-        Returns:
-            Resampled audio as a NumPy array of shape (channels, samples).
-
-        Raises:
-            ImportError: If PyAV is needed but not installed.
-            ValueError: If no audio stream found.
-            RuntimeError: If audio loading fails.
-        """
         if isinstance(audio, Pathlike):
-
+            audio_path = Path(str(audio)).expanduser()
+            audio_source = str(audio_path)
+
+        if audio_path and audio_path.suffix.lower() in [".mp4", ".m4a", ".aac", ".mov", ".webm", ".avi", ".mkv"]:
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector)
 
-        # load audio in chunks to reduce memory footprint for long files
         try:
-
-
-
+            return self._load_audio_with_soundfile(audio_source, sampling_rate, channel_selector)
+        except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector, primary_error)
 
-
-
-
-
-
+    def _load_audio_with_soundfile(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+    ) -> np.ndarray:
+        """Load audio via soundfile with chunking support for long inputs."""
+        info = sf.info(audio)
+        duration = info.duration
 
-
-
-
+        if duration > 3600:
+            with sf.SoundFile(audio, "r") as f:
+                sample_rate = f.samplerate
+                total_frames = f.frames
 
-
-
+                num_channels = 1 if channel_selector else f.channels
+                expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+                waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
 
-
-
-                output_offset = 0
+                chunk_frames = int(sample_rate * 1800)
+                output_offset = 0
 
-
-
-
-
+                while True:
+                    chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                    if chunk.size == 0:
+                        break
 
-
-
-
-
-
-
-                    )
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )
 
-
-
-
-                    output_offset += chunk_length
+                    chunk_length = resampled_chunk.shape[-1]
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length
 
-
-
+                    del chunk, resampled_chunk
+
+                if output_offset < expected_output_samples:
+                    waveform = waveform[..., :output_offset]
 
-
-                if output_offset < expected_output_samples:
-                    waveform = waveform[..., :output_offset]
+                return waveform
 
-
+        waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+        result = self._resample_audio(
+            (waveform, sample_rate),
+            sampling_rate,
+            device=self.device,
+            channel_selector=channel_selector,
+        )
+        del waveform
+        return result
+
+    def _load_audio_with_av(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+        primary_error: Optional[Exception] = None,
+    ) -> np.ndarray:
+        """Load audio via PyAV when soundfile is unavailable or unsuitable."""
+        try:
+            import av
+        except ImportError as exc:  # pragma: no cover
+            message = "PyAV (av) is required for loading certain audio formats. Install it with: pip install av"
+            if primary_error:
+                message = f"{message}\nPrimary error was: {primary_error}"
+            raise AudioLoadError(message) from exc
+
+        try:
+            container = av.open(audio)
+            audio_stream = next((s for s in container.streams if s.type == "audio"), None)
+
+            if audio_stream is None:
+                raise ValueError(f"No audio stream found in file: {audio}")
+
+            audio_stream.codec_context.format = av.AudioFormat("flt")
+            sample_rate = audio_stream.codec_context.sample_rate
+
+            duration_estimate = None
+            if audio_stream.duration and audio_stream.time_base:
+                duration_estimate = float(audio_stream.duration * audio_stream.time_base)
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            container = av.open(audio)
-            audio_stream = next((s for s in container.streams if s.type == "audio"), None)
-
-            if audio_stream is None:
-                raise ValueError(f"No audio stream found in file: {audio}")
-
-            audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-            sample_rate = audio_stream.codec_context.sample_rate
-
-            # Estimate duration to decide processing strategy
-            duration_estimate = None
-            if audio_stream.duration and audio_stream.time_base:
-                duration_estimate = float(audio_stream.duration * audio_stream.time_base)
-            else:
-                print(f"WARNING: Failed to estimate duration for audio: {audio}")
-
-            # For very long audio (>30 minutes), process and resample in chunks
-            if duration_estimate and duration_estimate > 1800:
-                # Estimate output size and pre-allocate with buffer
-                num_channels = 1 if channel_selector else audio_stream.codec_context.channels
-                estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
-                waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
-
-                frames = []
-                accumulated_samples = 0
-                output_offset = 0
-                chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
-
-                for frame in container.decode(audio_stream):
-                    array = frame.to_ndarray()
-
-                    # Ensure shape is (samples, channels)
-                    if array.ndim == 1:
-                        array = array.reshape(-1, 1)
-                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                        array = array.T
-
-                    frames.append(array)
-                    accumulated_samples += array.shape[0]
-
-                    # Process chunk when accumulated enough samples
-                    if accumulated_samples >= chunk_sample_target:
-                        chunk = np.concatenate(frames, axis=0).astype(np.float32)
-                        del frames  # Free frames list before resampling
-                        # Resample chunk -> (channels, samples)
-                        resampled_chunk = self._resample_audio(
-                            (chunk, sample_rate),
-                            sampling_rate,
-                            device=self.device,
-                            channel_selector=channel_selector,
-                        )
-
-                        chunk_length = resampled_chunk.shape[-1]
-                        if output_offset + chunk_length > waveform.shape[-1]:
-                            print(
-                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
-                            )
-                            resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
-
-                        # Write directly to array
-                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
-                        output_offset += chunk_length
-
-                        # Clean up immediately
-                        del chunk, resampled_chunk
-                        frames = []  # Create new list
-                        accumulated_samples = 0
-
-                # Process remaining frames
-                if frames:
+                print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+            if duration_estimate and duration_estimate > 1800:
+                num_channels = 1 if channel_selector else audio_stream.codec_context.channels
+                estimated_samples = int(duration_estimate * sampling_rate * 1.1)
+                waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                frames = []
+                accumulated_samples = 0
+                output_offset = 0
+                chunk_sample_target = int(sample_rate * 600)
+
+                for frame in container.decode(audio_stream):
+                    array = frame.to_ndarray()
+
+                    if array.ndim == 1:
+                        array = array.reshape(-1, 1)
+                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                        array = array.T
+
+                    frames.append(array)
+                    accumulated_samples += array.shape[0]
+
+                    if accumulated_samples >= chunk_sample_target:
                         chunk = np.concatenate(frames, axis=0).astype(np.float32)
                         del frames
                         resampled_chunk = self._resample_audio(

@@ -335,53 +299,68 @@ class AudioLoader:
 
                         chunk_length = resampled_chunk.shape[-1]
                         if output_offset + chunk_length > waveform.shape[-1]:
-                            print(
-                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
-                            )
+                            print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
                             resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
 
                         waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
                         output_offset += chunk_length
-                        del chunk, resampled_chunk
-
-                container.close()
 
-
-
+                        del chunk, resampled_chunk
+                        frames = []
+                        accumulated_samples = 0
 
-
-
-                return waveform
-            else:
-                # For shorter audio, process in batches to reduce memory
-                frames = []
-                for frame in container.decode(audio_stream):
-                    array = frame.to_ndarray()
-                    # Ensure shape is (channels, samples)
-                    if array.ndim == 1:
-                        array = array.reshape(-1, 1)
-                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                        array = array.T
-                    frames.append(array)
-                container.close()
-
-                if not frames:
-                    raise ValueError(f"No audio data found in file: {audio}")
-
-                # Concatenate remaining frames
-                waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                if frames:
+                    chunk = np.concatenate(frames, axis=0).astype(np.float32)
                     del frames
-
-
-                    (waveform, sample_rate),
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
                         sampling_rate,
                         device=self.device,
                         channel_selector=channel_selector,
                     )
-
-
-
-
+
+                    chunk_length = resampled_chunk.shape[-1]
+                    if output_offset + chunk_length > waveform.shape[-1]:
+                        print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
+                        resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length
+                    del chunk, resampled_chunk
+
+                container.close()
+
+                if output_offset == 0:
+                    raise ValueError(f"No audio data found in file: {audio}")
+
+                waveform = waveform[..., :output_offset]
+                return waveform
+
+            frames = []
+            for frame in container.decode(audio_stream):
+                array = frame.to_ndarray()
+                if array.ndim == 1:
+                    array = array.reshape(-1, 1)
+                elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                    array = array.T
+                frames.append(array)
+            container.close()
+
+            if not frames:
+                raise ValueError(f"No audio data found in file: {audio}")
+
+            waveform = np.concatenate(frames, axis=0).astype(np.float32)
+            del frames
+            result = self._resample_audio(
+                (waveform, sample_rate),
+                sampling_rate,
+                device=self.device,
+                channel_selector=channel_selector,
+            )
+            del waveform
+            return result
+        except Exception as exc:
+            raise RuntimeError(f"Failed to load audio file {audio}: {exc}")
 
     def __call__(
         self,
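The refactor above splits loading into `_load_audio_with_soundfile` and `_load_audio_with_av`, and for inputs longer than an hour it decodes and resamples in fixed-size chunks into a pre-allocated output buffer instead of holding the whole decoded file in memory. A minimal standalone sketch of that chunked soundfile pattern follows; the function and variable names are illustrative rather than the package's API, and naive linear interpolation stands in for the package's `_resample_audio`:

```python
import numpy as np
import soundfile as sf


def load_long_audio_chunked(path: str, target_sr: int, chunk_seconds: int = 1800) -> np.ndarray:
    """Illustrative sketch: decode a long file in chunks, resampling into a pre-allocated buffer."""
    with sf.SoundFile(path, "r") as f:
        src_sr, total_frames, channels = f.samplerate, f.frames, f.channels
        expected = int(total_frames * target_sr / src_sr)
        out = np.zeros((channels, expected), dtype=np.float32)

        offset = 0
        while True:
            chunk = f.read(frames=int(src_sr * chunk_seconds), dtype="float32", always_2d=True)
            if chunk.size == 0:
                break
            # Naive linear-interpolation resample, channel by channel (stand-in for _resample_audio).
            n_out = int(round(chunk.shape[0] * target_sr / src_sr))
            x_new = np.linspace(0.0, chunk.shape[0] - 1, num=n_out)
            x_old = np.arange(chunk.shape[0])
            resampled = np.stack([np.interp(x_new, x_old, chunk[:, c]) for c in range(channels)])
            n_out = min(n_out, out.shape[1] - offset)  # guard against rounding past the buffer
            out[:, offset : offset + n_out] = resampled[:, :n_out]
            offset += n_out

    return out[:, :offset]  # trim any unused tail left by rounding
```

The pre-allocation plus the per-chunk `del` in the real code keeps peak memory near one chunk plus the output buffer, rather than the full decoded file.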
lattifai/cli/__init__.py
CHANGED

@@ -4,7 +4,7 @@ import nemo_run as run  # noqa: F401
 
 # Import and re-export entrypoints at package level so NeMo Run can find them
 from lattifai.cli.alignment import align
-from lattifai.cli.caption import convert
+from lattifai.cli.caption import convert, diff
 from lattifai.cli.diarization import diarize
 from lattifai.cli.transcribe import transcribe, transcribe_align
 from lattifai.cli.youtube import youtube
@@ -12,6 +12,7 @@ from lattifai.cli.youtube import youtube
 __all__ = [
     "align",
     "convert",
+    "diff",
     "diarize",
     "transcribe",
     "transcribe_align",
lattifai/cli/alignment.py
CHANGED

@@ -1,5 +1,6 @@
 """Alignment CLI entry point with nemo_run."""
 
+import sys
 from typing import Optional
 
 import nemo_run as run
@@ -12,9 +13,11 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )
+from lattifai.errors import LattifAIError
 
 __all__ = ["align"]
 
@@ -30,6 +33,7 @@ def align(
     alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
 ):
     """
     Align audio/video with caption file.
@@ -121,6 +125,7 @@ def align(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )
 
     is_url = media_config.input_path.startswith(("http://", "https://"))
lattifai/cli/caption.py
CHANGED

@@ -6,6 +6,7 @@ import nemo_run as run
 from lhotse.utils import Pathlike
 from typing_extensions import Annotated
 
+from lattifai.caption.config import KaraokeConfig
 from lattifai.config import CaptionConfig
 from lattifai.utils import safe_print
 
@@ -16,6 +17,8 @@ def convert(
     output_path: Pathlike,
     include_speaker_in_text: bool = False,
     normalize_text: bool = False,
+    word_level: bool = False,
+    karaoke: bool = False,
 ):
     """
     Convert caption file to another format.
@@ -33,6 +36,11 @@ def convert(
         normalize_text: Whether to normalize caption text during conversion.
             This applies text cleaning such as removing HTML tags, decoding entities,
             collapsing whitespace, and standardizing punctuation.
+        word_level: Use word-level output format if supported.
+            When True without karaoke: outputs word-per-segment (each word as separate segment).
+            JSON format will include a 'words' field with word-level timestamps.
+        karaoke: Enable karaoke styling (requires word_level=True).
+            When True: outputs karaoke format (ASS \\kf tags, enhanced LRC, etc.).
 
     Examples:
         # Basic format conversion (positional arguments)
@@ -41,6 +49,15 @@ def convert(
         # Convert with text normalization
         lai caption convert input.srt output.json normalize_text=true
 
+        # Convert to word-per-segment output (if input has alignment)
+        lai caption convert input.json output.srt word_level=true
+
+        # Convert to karaoke format (ASS with \\kf tags)
+        lai caption convert input.json output.ass word_level=true karaoke=true
+
+        # Export JSON with word-level timestamps
+        lai caption convert input.srt output.json word_level=true
+
         # Mixing positional and keyword arguments
         lai caption convert input.srt output.vtt \\
             include_speaker_in_text=false \\
@@ -51,10 +68,18 @@ def convert(
             input_path=input.srt \\
             output_path=output.TextGrid
     """
-    from lattifai.
+    from lattifai.data import Caption
+
+    # Create karaoke_config if karaoke flag is set
+    karaoke_config = KaraokeConfig(enabled=True) if karaoke else None
 
     caption = Caption.read(input_path, normalize_text=normalize_text)
-    caption.write(
+    caption.write(
+        output_path,
+        include_speaker_in_text=include_speaker_in_text,
+        word_level=word_level,
+        karaoke_config=karaoke_config,
+    )
 
     safe_print(f"✅ Converted {input_path} -> {output_path}")
     return output_path
@@ -96,7 +121,7 @@ def normalize(
     """
     from pathlib import Path
 
-    from lattifai.
+    from lattifai.data import Caption
 
     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()
@@ -151,7 +176,7 @@ def shift(
     """
     from pathlib import Path
 
-    from lattifai.
+    from lattifai.data import Caption
 
     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()
@@ -178,6 +203,88 @@ def shift(
     return output_path
 
 
+@run.cli.entrypoint(name="diff", namespace="caption")
+def diff(
+    ref_path: Pathlike,
+    hyp_path: Pathlike,
+    split_sentence: bool = True,
+    verbose: bool = True,
+):
+    """
+    Compare and align caption supervisions with transcription segments.
+
+    This command reads a reference caption file and a hypothesis file, then performs
+    text alignment to show how they match up. It's useful for comparing
+    original subtitles against ASR (Automatic Speech Recognition) results.
+
+    Args:
+        ref_path: Path to reference caption file (ground truth)
+        hyp_path: Path to hypothesis file (e.g., ASR results)
+        split_sentence: Enable sentence splitting before alignment (default: True)
+        verbose: Enable verbose output to show detailed alignment info (default: True)
+
+    Examples:
+        # Compare reference with hypothesis (positional arguments)
+        lai caption diff subtitles.srt transcription.json
+
+        # Disable sentence splitting
+        lai caption diff subtitles.srt transcription.json split_sentence=false
+
+        # Disable verbose output
+        lai caption diff subtitles.srt transcription.json verbose=false
+    """
+    from pathlib import Path
+
+    from lattifai.alignment.text_align import align_supervisions_and_transcription
+    from lattifai.caption import SentenceSplitter
+    from lattifai.data import Caption
+
+    ref_path = Path(ref_path).expanduser()
+    hyp_path = Path(hyp_path).expanduser()
+
+    # Read reference caption (supervisions)
+    caption_obj = Caption.read(ref_path)
+
+    # Read hypothesis
+    hyp_obj = Caption.read(hyp_path)
+
+    # Apply sentence splitting if enabled
+    if split_sentence:
+        splitter = SentenceSplitter(device="cpu", lazy_init=True)
+        caption_obj.supervisions = splitter.split_sentences(caption_obj.supervisions)
+        hyp_obj.supervisions = splitter.split_sentences(hyp_obj.supervisions)
+
+    # Set transcription on caption object
+    caption_obj.transcription = hyp_obj.supervisions
+
+    safe_print(f"📖 Reference: {len(caption_obj.supervisions)} segments from {ref_path}")
+    safe_print(f"🎤 Hypothesis: {len(caption_obj.transcription)} segments from {hyp_path}")
+    if split_sentence:
+        safe_print("✂️ Sentence splitting: enabled")
+    safe_print("")
+
+    # Perform alignment
+    results = align_supervisions_and_transcription(
+        caption=caption_obj,
+        verbose=verbose,
+    )
+
+    # # Print summary
+    # safe_print("")
+    # safe_print("=" * 72)
+    # safe_print(f"📊 Alignment Summary: {len(results)} groups")
+    # for idx, (sub_align, asr_align, quality, timestamp, typing) in enumerate(results):
+    #     sub_count = len(sub_align) if sub_align else 0
+    #     asr_count = len(asr_align) if asr_align else 0
+    #     safe_print(f"    Group {idx + 1}: ref={sub_count}, hyp={asr_count}, {quality.info}, typing={typing}")
+
+    return results
+
+
+def main_diff():
+    run.cli.main(diff)
+
+
 def main_convert():
     run.cli.main(convert)
 
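Based only on the imports and calls visible in this diff (`Caption.read`, `Caption.write`, `KaraokeConfig`), the new word-level/karaoke conversion could be driven from Python roughly as below; the file names are placeholders and the exact keyword semantics are whatever `Caption.write` implements in 1.3.0:

```python
from lattifai.caption.config import KaraokeConfig
from lattifai.data import Caption

# Read an aligned caption file (placeholder path), then write it back out as an
# ASS karaoke file, mirroring what `lai caption convert ... word_level=true karaoke=true` does.
caption = Caption.read("input.json", normalize_text=False)
caption.write(
    "output.ass",
    include_speaker_in_text=False,
    word_level=True,                              # word-per-segment / word timestamps
    karaoke_config=KaraokeConfig(enabled=True),   # \kf styling; requires word_level=True
)
```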
lattifai/cli/transcribe.py
CHANGED

@@ -92,10 +92,6 @@ def transcribe(
     client_wrapper = SyncAPIClient(config=client_config)
     transcription_config.client_wrapper = client_wrapper
 
-    # Initialize client wrapper to properly set client_wrapper
-    client_wrapper = SyncAPIClient(config=client_config)
-    transcription_config.client_wrapper = client_wrapper
-
     # Validate input is required
     if not input and not media_config.input_path:
         raise ValueError("Input is required. Provide input as positional argument or media.input_path.")
@@ -129,7 +125,7 @@ def transcribe(
     if is_url:
         # Download media first, then transcribe
         safe_print(colorful.cyan("  Downloading media from URL..."))
-        from lattifai.
+        from lattifai.youtube import YouTubeDownloader
 
         downloader = YouTubeDownloader()
         input_path = asyncio.run(
@@ -170,7 +166,7 @@ def transcribe(
     safe_print(colorful.cyan(f"  Output: {final_output}"))
 
     # Write output
-    transcriber.write(transcript, final_output, encoding="utf-8",
+    transcriber.write(transcript, final_output, encoding="utf-8", cache_event=False)
 
     safe_print(colorful.green(f"🎉 Transcription completed: {final_output}"))
 