lattifai-1.2.2-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (62)
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
  32. lattifai-1.3.0.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/audio2.py CHANGED
@@ -162,168 +162,132 @@ class AudioLoader:
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
     ) -> np.ndarray:
-        """Load audio from file or binary stream and resample to target rate.
+        """Load audio from file or binary stream and resample to target rate."""
+        audio_source: Union[str, BinaryIO] = audio
+        audio_path: Optional[Path] = None
 
-        Args:
-            audio: Path to audio file or binary stream.
-            sampling_rate: Target sampling rate.
-            channel_selector: How to select channels.
-
-        Returns:
-            Resampled audio as a NumPy array of shape (channels, samples).
-
-        Raises:
-            ImportError: If PyAV is needed but not installed.
-            ValueError: If no audio stream found.
-            RuntimeError: If audio loading fails.
-        """
         if isinstance(audio, Pathlike):
-            audio = str(Path(str(audio)).expanduser())
+            audio_path = Path(str(audio)).expanduser()
+            audio_source = str(audio_path)
+
+        if audio_path and audio_path.suffix.lower() in [".mp4", ".m4a", ".aac", ".mov", ".webm", ".avi", ".mkv"]:
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector)
 
-        # load audio in chunks to reduce memory footprint for long files
         try:
-            # First check file duration to decide loading strategy
-            info = sf.info(audio)
-            duration = info.duration
+            return self._load_audio_with_soundfile(audio_source, sampling_rate, channel_selector)
+        except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector, primary_error)
 
-            # For very long audio (>60 minutes), use chunk-based loading
-            if duration > 3600:  # 60 minutes
-                with sf.SoundFile(audio, "r") as f:
-                    sample_rate = f.samplerate
-                    total_frames = f.frames
+    def _load_audio_with_soundfile(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+    ) -> np.ndarray:
+        """Load audio via soundfile with chunking support for long inputs."""
+        info = sf.info(audio)
+        duration = info.duration
 
-                    # Pre-calculate output size to avoid list accumulation
-                    num_channels = 1 if channel_selector else f.channels
-                    expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+        if duration > 3600:
+            with sf.SoundFile(audio, "r") as f:
+                sample_rate = f.samplerate
+                total_frames = f.frames
 
-                    # Pre-allocate output array
-                    waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
+                num_channels = 1 if channel_selector else f.channels
+                expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+                waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
 
-                    # Use source sample rate for reading, not target
-                    chunk_frames = int(sample_rate * 1800)  # 30-minute chunks at source rate
-                    output_offset = 0
+                chunk_frames = int(sample_rate * 1800)
+                output_offset = 0
 
-                    while True:
-                        chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
-                        if chunk.size == 0:
-                            break
+                while True:
+                    chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                    if chunk.size == 0:
+                        break
 
-                        # Resample chunk -> (channels, samples)
-                        resampled_chunk = self._resample_audio(
-                            (chunk, sample_rate),
-                            sampling_rate,
-                            device=self.device,
-                            channel_selector=channel_selector,
-                        )
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )
 
-                        # Write directly to pre-allocated array
-                        chunk_length = resampled_chunk.shape[-1]
-                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
-                        output_offset += chunk_length
+                    chunk_length = resampled_chunk.shape[-1]
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length
 
-                        # Clean up immediately
-                        del chunk, resampled_chunk
+                    del chunk, resampled_chunk
+
+                if output_offset < expected_output_samples:
+                    waveform = waveform[..., :output_offset]
 
-                    # Trim to actual size if needed (due to rounding in resampling)
-                    if output_offset < expected_output_samples:
-                        waveform = waveform[..., :output_offset]
+                return waveform
 
-                    return waveform
+        waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+        result = self._resample_audio(
+            (waveform, sample_rate),
+            sampling_rate,
+            device=self.device,
+            channel_selector=channel_selector,
+        )
+        del waveform
+        return result
+
+    def _load_audio_with_av(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+        primary_error: Optional[Exception] = None,
+    ) -> np.ndarray:
+        """Load audio via PyAV when soundfile is unavailable or unsuitable."""
+        try:
+            import av
+        except ImportError as exc:  # pragma: no cover
+            message = "PyAV (av) is required for loading certain audio formats. Install it with: pip install av"
+            if primary_error:
+                message = f"{message}\nPrimary error was: {primary_error}"
+            raise AudioLoadError(message) from exc
+
+        try:
+            container = av.open(audio)
+            audio_stream = next((s for s in container.streams if s.type == "audio"), None)
+
+            if audio_stream is None:
+                raise ValueError(f"No audio stream found in file: {audio}")
+
+            audio_stream.codec_context.format = av.AudioFormat("flt")
+            sample_rate = audio_stream.codec_context.sample_rate
+
+            duration_estimate = None
+            if audio_stream.duration and audio_stream.time_base:
+                duration_estimate = float(audio_stream.duration * audio_stream.time_base)
             else:
-                # For shorter audio, use standard loading
-                waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
-                # Resample and return directly to avoid double processing
-                result = self._resample_audio(
-                    (waveform, sample_rate),
-                    sampling_rate,
-                    device=self.device,
-                    channel_selector=channel_selector,
-                )
-                del waveform
-                return result
-        except Exception as primary_error:
-            print(f"Primary error with soundfile: {primary_error}")
-            # Fallback to PyAV for formats not supported by soundfile
-            try:
-                import av
-            except ImportError:
-                raise AudioLoadError(
-                    "PyAV (av) is required for loading certain audio formats. "
-                    f"Install it with: pip install av\n"
-                    f"Primary error was: {primary_error}"
-                )
-
-            try:
-                container = av.open(audio)
-                audio_stream = next((s for s in container.streams if s.type == "audio"), None)
-
-                if audio_stream is None:
-                    raise ValueError(f"No audio stream found in file: {audio}")
-
-                audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-                sample_rate = audio_stream.codec_context.sample_rate
-
-                # Estimate duration to decide processing strategy
-                duration_estimate = None
-                if audio_stream.duration and audio_stream.time_base:
-                    duration_estimate = float(audio_stream.duration * audio_stream.time_base)
-                else:
-                    print(f"WARNING: Failed to estimate duration for audio: {audio}")
-
-                # For very long audio (>30 minutes), process and resample in chunks
-                if duration_estimate and duration_estimate > 1800:
-                    # Estimate output size and pre-allocate with buffer
-                    num_channels = 1 if channel_selector else audio_stream.codec_context.channels
-                    estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
-                    waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
-
-                    frames = []
-                    accumulated_samples = 0
-                    output_offset = 0
-                    chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
-
-                    for frame in container.decode(audio_stream):
-                        array = frame.to_ndarray()
-
-                        # Ensure shape is (samples, channels)
-                        if array.ndim == 1:
-                            array = array.reshape(-1, 1)
-                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                            array = array.T
-
-                        frames.append(array)
-                        accumulated_samples += array.shape[0]
-
-                        # Process chunk when accumulated enough samples
-                        if accumulated_samples >= chunk_sample_target:
+                print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+            if duration_estimate and duration_estimate > 1800:
+                num_channels = 1 if channel_selector else audio_stream.codec_context.channels
+                estimated_samples = int(duration_estimate * sampling_rate * 1.1)
+                waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                frames = []
+                accumulated_samples = 0
+                output_offset = 0
+                chunk_sample_target = int(sample_rate * 600)
+
+                for frame in container.decode(audio_stream):
+                    array = frame.to_ndarray()
+
+                    if array.ndim == 1:
+                        array = array.reshape(-1, 1)
+                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                        array = array.T
+
+                    frames.append(array)
+                    accumulated_samples += array.shape[0]
+
+                    if accumulated_samples >= chunk_sample_target:
                         chunk = np.concatenate(frames, axis=0).astype(np.float32)
                         del frames
                         resampled_chunk = self._resample_audio(
@@ -335,53 +299,68 @@ class AudioLoader:
 
                         chunk_length = resampled_chunk.shape[-1]
                         if output_offset + chunk_length > waveform.shape[-1]:
-                            print(
-                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
-                            )
+                            print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
                             resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
 
                         waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
                         output_offset += chunk_length
-                        del chunk, resampled_chunk
-
-                    container.close()
 
-                    if output_offset == 0:
-                        raise ValueError(f"No audio data found in file: {audio}")
+                        del chunk, resampled_chunk
+                        frames = []
+                        accumulated_samples = 0
 
-                    # Trim to actual size
-                    waveform = waveform[..., :output_offset]
-                    return waveform
-                else:
-                    # For shorter audio, process in batches to reduce memory
-                    frames = []
-                    for frame in container.decode(audio_stream):
-                        array = frame.to_ndarray()
-                        # Ensure shape is (channels, samples)
-                        if array.ndim == 1:
-                            array = array.reshape(-1, 1)
-                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                            array = array.T
-                        frames.append(array)
-                    container.close()
-
-                    if not frames:
-                        raise ValueError(f"No audio data found in file: {audio}")
-
-                    # Concatenate remaining frames
-                    waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                if frames:
+                    chunk = np.concatenate(frames, axis=0).astype(np.float32)
                     del frames
-                    # Resample and return directly
-                    result = self._resample_audio(
-                        (waveform, sample_rate),
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
                         sampling_rate,
                         device=self.device,
                        channel_selector=channel_selector,
                     )
-                    del waveform
-                    return result
-            except Exception as e:
-                raise RuntimeError(f"Failed to load audio file {audio}: {e}")
+
+                    chunk_length = resampled_chunk.shape[-1]
+                    if output_offset + chunk_length > waveform.shape[-1]:
+                        print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
+                        resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length
+                    del chunk, resampled_chunk
+
+                container.close()
+
+                if output_offset == 0:
+                    raise ValueError(f"No audio data found in file: {audio}")
+
+                waveform = waveform[..., :output_offset]
+                return waveform
+
+            frames = []
+            for frame in container.decode(audio_stream):
+                array = frame.to_ndarray()
+                if array.ndim == 1:
+                    array = array.reshape(-1, 1)
+                elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                    array = array.T
+                frames.append(array)
+            container.close()
+
+            if not frames:
+                raise ValueError(f"No audio data found in file: {audio}")
+
+            waveform = np.concatenate(frames, axis=0).astype(np.float32)
+            del frames
+            result = self._resample_audio(
+                (waveform, sample_rate),
+                sampling_rate,
+                device=self.device,
+                channel_selector=channel_selector,
+            )
+            del waveform
+            return result
+        except Exception as exc:
            raise RuntimeError(f"Failed to load audio file {audio}: {exc}")
 
     def __call__(
         self,
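Editor's note on the hunk above: the monolithic loader body is split into `_load_audio_with_soundfile` and `_load_audio_with_av`, and known container formats now bypass soundfile entirely. A minimal sketch of the resulting dispatch order (illustrative only; `choose_backend` is not part of the package):

# Illustrative sketch of the 1.3.0 decoder routing; not shipped by lattifai.
from pathlib import Path

CONTAINER_SUFFIXES = {".mp4", ".m4a", ".aac", ".mov", ".webm", ".avi", ".mkv"}

def choose_backend(path: str) -> str:
    """Mirror the new routing: container formats go straight to PyAV."""
    if Path(path).suffix.lower() in CONTAINER_SUFFIXES:
        return "pyav"  # soundfile is skipped for these suffixes
    return "soundfile"  # tried first; PyAV remains the error fallback

assert choose_backend("episode.m4a") == "pyav"
assert choose_backend("episode.wav") == "soundfile"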
lattifai/cli/alignment.py CHANGED
@@ -1,5 +1,6 @@
 """Alignment CLI entry point with nemo_run."""
 
+import sys
 from typing import Optional
 
 import nemo_run as run
@@ -12,9 +13,11 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )
+from lattifai.errors import LattifAIError
 
 __all__ = ["align"]
 
@@ -30,6 +33,7 @@ def align(
     alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
 ):
     """
     Align audio/video with caption file.
@@ -121,6 +125,7 @@ def align(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )
 
     is_url = media_config.input_path.startswith(("http://", "https://"))
lattifai/cli/caption.py CHANGED
@@ -6,8 +6,8 @@ import nemo_run as run
 from lhotse.utils import Pathlike
 from typing_extensions import Annotated
 
+from lattifai.caption.config import KaraokeConfig
 from lattifai.config import CaptionConfig
-from lattifai.config.caption import KaraokeConfig
 from lattifai.utils import safe_print
 
 
@@ -68,7 +68,7 @@ def convert(
         input_path=input.srt \\
         output_path=output.TextGrid
     """
-    from lattifai.caption import Caption
+    from lattifai.data import Caption
 
     # Create karaoke_config if karaoke flag is set
     karaoke_config = KaraokeConfig(enabled=True) if karaoke else None
@@ -121,7 +121,7 @@ def normalize(
     """
     from pathlib import Path
 
-    from lattifai.caption import Caption
+    from lattifai.data import Caption
 
     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()
@@ -176,7 +176,7 @@ def shift(
     """
     from pathlib import Path
 
-    from lattifai.caption import Caption
+    from lattifai.data import Caption
 
     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()
@@ -235,9 +235,9 @@ def diff(
     """
     from pathlib import Path
 
-    from lattifai.alignment.sentence_splitter import SentenceSplitter
     from lattifai.alignment.text_align import align_supervisions_and_transcription
-    from lattifai.caption import Caption
+    from lattifai.caption import SentenceSplitter
+    from lattifai.data import Caption
 
     ref_path = Path(ref_path).expanduser()
     hyp_path = Path(hyp_path).expanduser()
lattifai/cli/transcribe.py CHANGED
@@ -92,10 +92,6 @@ def transcribe(
     client_wrapper = SyncAPIClient(config=client_config)
     transcription_config.client_wrapper = client_wrapper
 
-    # Initialize client wrapper to properly set client_wrapper
-    client_wrapper = SyncAPIClient(config=client_config)
-    transcription_config.client_wrapper = client_wrapper
-
     # Validate input is required
     if not input and not media_config.input_path:
         raise ValueError("Input is required. Provide input as positional argument or media.input_path.")
@@ -170,7 +166,7 @@ def transcribe(
     safe_print(colorful.cyan(f" Output: {final_output}"))
 
     # Write output
-    transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)
+    transcriber.write(transcript, final_output, encoding="utf-8", cache_event=False)
 
     safe_print(colorful.green(f"🎉 Transcription completed: {final_output}"))

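The recurring `from lattifai.caption import Caption` → `from lattifai.data import Caption` change above reflects the package restructure in the file list (items 35-57 remove the old `lattifai/caption` modules; item 21 adds `lattifai/data/caption.py`). Downstream code that must run against both versions could hedge the import like this (a sketch, not shipped by the package):

# Compatibility sketch for code targeting both 1.2.x and 1.3.0 layouts.
try:
    from lattifai.data import Caption  # 1.3.0 layout
except ImportError:
    from lattifai.caption import Caption  # 1.2.x layout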
lattifai/cli/youtube.py CHANGED
@@ -11,6 +11,7 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )
@@ -25,6 +26,7 @@ def youtube(
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
     use_transcription: bool = False,
 ):
     """
@@ -114,6 +116,7 @@ def youtube(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )
 
     # Call the client's youtube method
lattifai/client.py CHANGED
@@ -7,10 +7,18 @@ import colorful
 from lattifai_core.client import SyncAPIClient
 from lhotse.utils import Pathlike
 
-from lattifai.alignment import Lattice1Aligner, Segmenter, align_supervisions_and_transcription
+from lattifai.alignment import Lattice1Aligner, Segmenter
 from lattifai.audio2 import AudioData, AudioLoader
-from lattifai.caption import Caption, InputCaptionFormat
-from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
+from lattifai.caption import InputCaptionFormat
+from lattifai.config import (
+    AlignmentConfig,
+    CaptionConfig,
+    ClientConfig,
+    DiarizationConfig,
+    EventConfig,
+    TranscriptionConfig,
+)
+from lattifai.data import Caption
 from lattifai.errors import (
     AlignmentError,
     CaptionProcessingError,
@@ -22,6 +30,7 @@ from lattifai.utils import safe_print
 
 if TYPE_CHECKING:
     from lattifai.diarization import LattifAIDiarizer  # noqa: F401
+    from lattifai.event import LattifAIEventDetector  # noqa: F401
 
 
 class LattifAI(LattifAIClientMixin, SyncAPIClient):
@@ -41,6 +50,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         caption_config: Optional[CaptionConfig] = None,
         transcription_config: Optional[TranscriptionConfig] = None,
         diarization_config: Optional[DiarizationConfig] = None,
+        event_config: Optional[EventConfig] = None,
     ) -> None:
         __doc__ = LattifAIClientMixin._INIT_DOC.format(
             client_class="LattifAI",
@@ -59,8 +69,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         self.config = client_config
 
         # Initialize all configs with defaults
-        alignment_config, transcription_config, diarization_config = self._init_configs(
-            alignment_config, transcription_config, diarization_config
+        alignment_config, transcription_config, diarization_config, event_config = self._init_configs(
+            alignment_config, transcription_config, diarization_config, event_config
         )
 
         # Store configs
@@ -82,6 +92,14 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
         self.diarizer = LattifAIDiarizer(config=self.diarization_config)
 
+        # Initialize event detector if enabled
+        self.event_config = event_config
+        self.event_detector: Optional["LattifAIEventDetector"] = None
+        if self.event_config.enabled:
+            from lattifai.event import LattifAIEventDetector  # noqa: F811
+
+            self.event_detector = LattifAIEventDetector(config=self.event_config)
+
         # Initialize shared components (transcriber, downloader)
         self._init_shared_components(transcription_config)
 
@@ -126,6 +144,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
             if caption.supervisions and alignment_strategy == "transcription":
+                from lattifai.alignment.text_align import align_supervisions_and_transcription
+
                 if "gemini" in self.transcriber.name.lower():
                     raise ValueError(
                         f"Transcription-based alignment is not supported for {self.transcriber.name} "
@@ -139,7 +159,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                     output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
                 )
                 caption.transcription = transcript.supervisions or transcript.transcription
-                caption.audio_events = transcript.audio_events
+                caption.event = transcript.event
                 if not caption.transcription:
                     raise ValueError("Transcription is empty after transcription step.")
 
@@ -234,13 +254,15 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             if self.config.profile:
                 self.aligner.profile()
 
-        except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
+        except (CaptionProcessingError, LatticeEncodingError) as e:
             # Re-raise our specific errors as-is
-            raise
+            raise e
+        except LatticeDecodingError as e:
+            raise e
         except Exception as e:
             # Catch any unexpected errors and wrap them
             raise AlignmentError(
-                "Unexpected error during alignment process",
+                message="Unexpected error during alignment process",
                 media_path=str(input_media),
                 caption_path=str(input_caption),
                 context={"original_error": str(e), "error_type": e.__class__.__name__},
@@ -255,6 +277,13 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 output_caption_path=output_caption_path,
             )
 
+        # Step 6: Event detection
+        if self.event_config.enabled and self.event_detector:
+            safe_print(colorful.cyan("🔊 Performing audio event detection..."))
+            caption = self.event_detector.detect_and_update_caption(caption, media_audio)
+            if output_caption_path:
+                self._write_caption(caption, output_caption_path)
+
         return caption
 
     def speaker_diarization(
@@ -285,12 +314,12 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
         if diarization_file.exists():
             safe_print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
-            caption.read_speaker_diarization(diarization_file)
+            caption.read_diarization(diarization_file)
 
         diarization, alignments = self.diarizer.diarize_with_alignments(
             input_media,
             caption.alignments,
-            diarization=caption.speaker_diarization,
+            diarization=caption.diarization,
             alignment_fn=self.aligner.alignment,
             transcribe_fn=self.transcriber.transcribe_numpy if self.transcriber else None,
             separate_fn=self.aligner.separate if self.aligner.worker.separator_ort else None,
@@ -298,7 +327,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             output_path=output_caption_path,
         )
         caption.alignments = alignments
-        caption.speaker_diarization = diarization
+        caption.diarization = diarization
 
         # Write output if requested
         if output_caption_path:
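Taken together, the client changes thread the new EventConfig from the constructor through to a final event-detection step after alignment. A hypothetical construction sketch follows; it assumes EventConfig accepts `enabled` as a keyword argument (the diff only shows the attribute being read) and that default client credentials are available in the environment:

# Hypothetical usage sketch of the 1.3.0 event-detection gate; not from the package docs.
from lattifai.client import LattifAI
from lattifai.config import EventConfig

client = LattifAI(event_config=EventConfig(enabled=True))
# The detector is built only when the flag is set; otherwise event_detector
# stays None and Step 6 of the alignment flow is skipped entirely.
assert client.event_detector is not None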