lattifai-1.2.1-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. lattifai/__init__.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/audio2.py CHANGED
@@ -162,168 +162,132 @@ class AudioLoader:
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
     ) -> np.ndarray:
-        """Load audio from file or binary stream and resample to target rate.
+        """Load audio from file or binary stream and resample to target rate."""
+        audio_source: Union[str, BinaryIO] = audio
+        audio_path: Optional[Path] = None

-        Args:
-            audio: Path to audio file or binary stream.
-            sampling_rate: Target sampling rate.
-            channel_selector: How to select channels.
-
-        Returns:
-            Resampled audio as a NumPy array of shape (channels, samples).
-
-        Raises:
-            ImportError: If PyAV is needed but not installed.
-            ValueError: If no audio stream found.
-            RuntimeError: If audio loading fails.
-        """
         if isinstance(audio, Pathlike):
-            audio = str(Path(str(audio)).expanduser())
+            audio_path = Path(str(audio)).expanduser()
+            audio_source = str(audio_path)
+
+        if audio_path and audio_path.suffix.lower() in [".mp4", ".m4a", ".aac", ".mov", ".webm", ".avi", ".mkv"]:
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector)

-        # load audio in chunks to reduce memory footprint for long files
         try:
-            # First check file duration to decide loading strategy
-            info = sf.info(audio)
-            duration = info.duration
+            return self._load_audio_with_soundfile(audio_source, sampling_rate, channel_selector)
+        except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
+            return self._load_audio_with_av(audio_source, sampling_rate, channel_selector, primary_error)

-            # For very long audio (>60 minutes), use chunk-based loading
-            if duration > 3600:  # 60 minutes
-                with sf.SoundFile(audio, "r") as f:
-                    sample_rate = f.samplerate
-                    total_frames = f.frames
+    def _load_audio_with_soundfile(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+    ) -> np.ndarray:
+        """Load audio via soundfile with chunking support for long inputs."""
+        info = sf.info(audio)
+        duration = info.duration

-                    # Pre-calculate output size to avoid list accumulation
-                    num_channels = 1 if channel_selector else f.channels
-                    expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+        if duration > 3600:
+            with sf.SoundFile(audio, "r") as f:
+                sample_rate = f.samplerate
+                total_frames = f.frames

-                    # Pre-allocate output array
-                    waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
+                num_channels = 1 if channel_selector else f.channels
+                expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+                waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)

-                    # Use source sample rate for reading, not target
-                    chunk_frames = int(sample_rate * 1800)  # 30-minute chunks at source rate
-                    output_offset = 0
+                chunk_frames = int(sample_rate * 1800)
+                output_offset = 0

-                    while True:
-                        chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
-                        if chunk.size == 0:
-                            break
+                while True:
+                    chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                    if chunk.size == 0:
+                        break

-                        # Resample chunk -> (channels, samples)
-                        resampled_chunk = self._resample_audio(
-                            (chunk, sample_rate),
-                            sampling_rate,
-                            device=self.device,
-                            channel_selector=channel_selector,
-                        )
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )

-                        # Write directly to pre-allocated array
-                        chunk_length = resampled_chunk.shape[-1]
-                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
-                        output_offset += chunk_length
+                    chunk_length = resampled_chunk.shape[-1]
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length

-                        # Clean up immediately
-                        del chunk, resampled_chunk
+                    del chunk, resampled_chunk
+
+            if output_offset < expected_output_samples:
+                waveform = waveform[..., :output_offset]

-                    # Trim to actual size if needed (due to rounding in resampling)
-                    if output_offset < expected_output_samples:
-                        waveform = waveform[..., :output_offset]
+            return waveform

-                    return waveform
+        waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+        result = self._resample_audio(
+            (waveform, sample_rate),
+            sampling_rate,
+            device=self.device,
+            channel_selector=channel_selector,
+        )
+        del waveform
+        return result
+
+    def _load_audio_with_av(
+        self,
+        audio: Union[str, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+        primary_error: Optional[Exception] = None,
+    ) -> np.ndarray:
+        """Load audio via PyAV when soundfile is unavailable or unsuitable."""
+        try:
+            import av
+        except ImportError as exc:  # pragma: no cover
+            message = "PyAV (av) is required for loading certain audio formats. Install it with: pip install av"
+            if primary_error:
+                message = f"{message}\nPrimary error was: {primary_error}"
+            raise AudioLoadError(message) from exc
+
+        try:
+            container = av.open(audio)
+            audio_stream = next((s for s in container.streams if s.type == "audio"), None)
+
+            if audio_stream is None:
+                raise ValueError(f"No audio stream found in file: {audio}")
+
+            audio_stream.codec_context.format = av.AudioFormat("flt")
+            sample_rate = audio_stream.codec_context.sample_rate
+
+            duration_estimate = None
+            if audio_stream.duration and audio_stream.time_base:
+                duration_estimate = float(audio_stream.duration * audio_stream.time_base)
             else:
-                # For shorter audio, use standard loading
-                waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
-                # Resample and return directly to avoid double processing
-                result = self._resample_audio(
-                    (waveform, sample_rate),
-                    sampling_rate,
-                    device=self.device,
-                    channel_selector=channel_selector,
-                )
-                del waveform
-                return result
-        except Exception as primary_error:
-            print(f"Primary error with soundfile: {primary_error}")
-            # Fallback to PyAV for formats not supported by soundfile
-            try:
-                import av
-            except ImportError:
-                raise AudioLoadError(
-                    "PyAV (av) is required for loading certain audio formats. "
-                    f"Install it with: pip install av\n"
-                    f"Primary error was: {primary_error}"
-                )
-
-            try:
-                container = av.open(audio)
-                audio_stream = next((s for s in container.streams if s.type == "audio"), None)
-
-                if audio_stream is None:
-                    raise ValueError(f"No audio stream found in file: {audio}")
-
-                audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-                sample_rate = audio_stream.codec_context.sample_rate
-
-                # Estimate duration to decide processing strategy
-                duration_estimate = None
-                if audio_stream.duration and audio_stream.time_base:
-                    duration_estimate = float(audio_stream.duration * audio_stream.time_base)
-                else:
-                    print(f"WARNING: Failed to estimate duration for audio: {audio}")
-
-                # For very long audio (>30 minutes), process and resample in chunks
-                if duration_estimate and duration_estimate > 1800:
-                    # Estimate output size and pre-allocate with buffer
-                    num_channels = 1 if channel_selector else audio_stream.codec_context.channels
-                    estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
-                    waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
-
-                    frames = []
-                    accumulated_samples = 0
-                    output_offset = 0
-                    chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
-
-                    for frame in container.decode(audio_stream):
-                        array = frame.to_ndarray()
-
-                        # Ensure shape is (samples, channels)
-                        if array.ndim == 1:
-                            array = array.reshape(-1, 1)
-                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                            array = array.T
-
-                        frames.append(array)
-                        accumulated_samples += array.shape[0]
-
-                        # Process chunk when accumulated enough samples
-                        if accumulated_samples >= chunk_sample_target:
-                            chunk = np.concatenate(frames, axis=0).astype(np.float32)
-                            del frames  # Free frames list before resampling
-                            # Resample chunk -> (channels, samples)
-                            resampled_chunk = self._resample_audio(
-                                (chunk, sample_rate),
-                                sampling_rate,
-                                device=self.device,
-                                channel_selector=channel_selector,
-                            )
-
-                            chunk_length = resampled_chunk.shape[-1]
-                            if output_offset + chunk_length > waveform.shape[-1]:
-                                print(
-                                    f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
-                                )
-                                resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
-
-                            # Write directly to array
-                            waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
-                            output_offset += chunk_length
-
-                            # Clean up immediately
-                            del chunk, resampled_chunk
-                            frames = []  # Create new list
-                            accumulated_samples = 0
-
-                    # Process remaining frames
-                    if frames:
+                print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+            if duration_estimate and duration_estimate > 1800:
+                num_channels = 1 if channel_selector else audio_stream.codec_context.channels
+                estimated_samples = int(duration_estimate * sampling_rate * 1.1)
+                waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                frames = []
+                accumulated_samples = 0
+                output_offset = 0
+                chunk_sample_target = int(sample_rate * 600)
+
+                for frame in container.decode(audio_stream):
+                    array = frame.to_ndarray()
+
+                    if array.ndim == 1:
+                        array = array.reshape(-1, 1)
+                    elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                        array = array.T
+
+                    frames.append(array)
+                    accumulated_samples += array.shape[0]
+
+                    if accumulated_samples >= chunk_sample_target:
                         chunk = np.concatenate(frames, axis=0).astype(np.float32)
                         del frames
                         resampled_chunk = self._resample_audio(
@@ -335,53 +299,68 @@ class AudioLoader:

                         chunk_length = resampled_chunk.shape[-1]
                         if output_offset + chunk_length > waveform.shape[-1]:
-                            print(
-                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
-                            )
+                            print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
                             resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]

                         waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
                         output_offset += chunk_length
-                        del chunk, resampled_chunk
-
-                    container.close()

-                    if output_offset == 0:
-                        raise ValueError(f"No audio data found in file: {audio}")
+                        del chunk, resampled_chunk
+                        frames = []
+                        accumulated_samples = 0

-                    # Trim to actual size
-                    waveform = waveform[..., :output_offset]
-                    return waveform
-                else:
-                    # For shorter audio, process in batches to reduce memory
-                    frames = []
-                    for frame in container.decode(audio_stream):
-                        array = frame.to_ndarray()
-                        # Ensure shape is (channels, samples)
-                        if array.ndim == 1:
-                            array = array.reshape(-1, 1)
-                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
-                            array = array.T
-                        frames.append(array)
-                    container.close()
-
-                    if not frames:
-                        raise ValueError(f"No audio data found in file: {audio}")
-
-                    # Concatenate remaining frames
-                    waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                if frames:
+                    chunk = np.concatenate(frames, axis=0).astype(np.float32)
                     del frames
-                    # Resample and return directly
-                    result = self._resample_audio(
-                        (waveform, sample_rate),
+                    resampled_chunk = self._resample_audio(
+                        (chunk, sample_rate),
                         sampling_rate,
                         device=self.device,
                         channel_selector=channel_selector,
                     )
-                    del waveform
-                    return result
-            except Exception as e:
-                raise RuntimeError(f"Failed to load audio file {audio}: {e}")
+
+                    chunk_length = resampled_chunk.shape[-1]
+                    if output_offset + chunk_length > waveform.shape[-1]:
+                        print("WARNING: Trimming resampled chunk to fit waveform buffer for audio: " f"{audio}")
+                        resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                    waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                    output_offset += chunk_length
+                    del chunk, resampled_chunk
+
+                container.close()
+
+                if output_offset == 0:
+                    raise ValueError(f"No audio data found in file: {audio}")
+
+                waveform = waveform[..., :output_offset]
+                return waveform
+
+            frames = []
+            for frame in container.decode(audio_stream):
+                array = frame.to_ndarray()
+                if array.ndim == 1:
+                    array = array.reshape(-1, 1)
+                elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                    array = array.T
+                frames.append(array)
+            container.close()
+
+            if not frames:
+                raise ValueError(f"No audio data found in file: {audio}")
+
+            waveform = np.concatenate(frames, axis=0).astype(np.float32)
+            del frames
+            result = self._resample_audio(
+                (waveform, sample_rate),
+                sampling_rate,
+                device=self.device,
+                channel_selector=channel_selector,
+            )
+            del waveform
+            return result
+        except Exception as exc:
+            raise RuntimeError(f"Failed to load audio file {audio}: {exc}")

     def __call__(
         self,
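Net effect of the audio2.py refactor: loading is now a three-way dispatch. Known container formats (.mp4, .m4a, .aac, .mov, .webm, .avi, .mkv) go straight to PyAV; everything else tries soundfile first and falls back to PyAV on failure, with chunked decode-and-resample paths for long inputs. Below is a minimal, self-contained sketch of that strategy under those assumptions; the function names are illustrative, not the package's API, and resampling/chunking are omitted:

```python
# Sketch of the 1.3.0 fallback strategy, not the AudioLoader implementation itself.
from pathlib import Path
from typing import Tuple

import numpy as np
import soundfile as sf

CONTAINER_SUFFIXES = {".mp4", ".m4a", ".aac", ".mov", ".webm", ".avi", ".mkv"}


def decode_with_av(path: str) -> Tuple[np.ndarray, int]:
    """Decode all audio frames with PyAV; returns (samples, channels) float32 plus sample rate."""
    import av  # optional dependency, as in the diff above

    with av.open(path) as container:
        stream = next((s for s in container.streams if s.type == "audio"), None)
        if stream is None:
            raise ValueError(f"No audio stream found in file: {path}")
        frames = []
        for frame in container.decode(stream):
            array = frame.to_ndarray()
            if array.ndim == 1:
                array = array.reshape(-1, 1)
            elif array.shape[0] < array.shape[1]:  # (channels, samples) -> (samples, channels)
                array = array.T
            frames.append(array)
        if not frames:
            raise ValueError(f"No audio data found in file: {path}")
        return np.concatenate(frames, axis=0).astype(np.float32), stream.codec_context.sample_rate


def load_audio(path: str) -> Tuple[np.ndarray, int]:
    if Path(path).suffix.lower() in CONTAINER_SUFFIXES:
        return decode_with_av(path)  # soundfile cannot open these containers
    try:
        return sf.read(path, always_2d=True, dtype="float32")
    except Exception:  # e.g. codec unsupported by libsndfile
        return decode_with_av(path)
```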
lattifai/cli/__init__.py CHANGED
@@ -4,7 +4,7 @@ import nemo_run as run  # noqa: F401

 # Import and re-export entrypoints at package level so NeMo Run can find them
 from lattifai.cli.alignment import align
-from lattifai.cli.caption import convert
+from lattifai.cli.caption import convert, diff
 from lattifai.cli.diarization import diarize
 from lattifai.cli.transcribe import transcribe, transcribe_align
 from lattifai.cli.youtube import youtube
@@ -12,6 +12,7 @@ from lattifai.cli.youtube import youtube
 __all__ = [
     "align",
     "convert",
+    "diff",
     "diarize",
     "transcribe",
     "transcribe_align",
lattifai/cli/alignment.py CHANGED
@@ -1,5 +1,6 @@
 """Alignment CLI entry point with nemo_run."""

+import sys
 from typing import Optional

 import nemo_run as run
@@ -12,9 +13,11 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )
+from lattifai.errors import LattifAIError

 __all__ = ["align"]

@@ -30,6 +33,7 @@ def align(
     alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
 ):
     """
     Align audio/video with caption file.
@@ -121,6 +125,7 @@ def align(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )

     is_url = media_config.input_path.startswith(("http://", "https://"))
lattifai/cli/caption.py CHANGED
@@ -6,6 +6,7 @@ import nemo_run as run
 from lhotse.utils import Pathlike
 from typing_extensions import Annotated

+from lattifai.caption.config import KaraokeConfig
 from lattifai.config import CaptionConfig
 from lattifai.utils import safe_print

@@ -16,6 +17,8 @@ def convert(
     output_path: Pathlike,
     include_speaker_in_text: bool = False,
     normalize_text: bool = False,
+    word_level: bool = False,
+    karaoke: bool = False,
 ):
     """
     Convert caption file to another format.
@@ -33,6 +36,11 @@
         normalize_text: Whether to normalize caption text during conversion.
             This applies text cleaning such as removing HTML tags, decoding entities,
             collapsing whitespace, and standardizing punctuation.
+        word_level: Use word-level output format if supported.
+            When True without karaoke: outputs word-per-segment (each word as separate segment).
+            JSON format will include a 'words' field with word-level timestamps.
+        karaoke: Enable karaoke styling (requires word_level=True).
+            When True: outputs karaoke format (ASS \\kf tags, enhanced LRC, etc.).

     Examples:
         # Basic format conversion (positional arguments)
@@ -41,6 +49,15 @@
         # Convert with text normalization
         lai caption convert input.srt output.json normalize_text=true

+        # Convert to word-per-segment output (if input has alignment)
+        lai caption convert input.json output.srt word_level=true
+
+        # Convert to karaoke format (ASS with \\kf tags)
+        lai caption convert input.json output.ass word_level=true karaoke=true
+
+        # Export JSON with word-level timestamps
+        lai caption convert input.srt output.json word_level=true
+
         # Mixing positional and keyword arguments
         lai caption convert input.srt output.vtt \\
             include_speaker_in_text=false \\
@@ -51,10 +68,18 @@
             input_path=input.srt \\
             output_path=output.TextGrid
     """
-    from lattifai.caption import Caption
+    from lattifai.data import Caption
+
+    # Create karaoke_config if karaoke flag is set
+    karaoke_config = KaraokeConfig(enabled=True) if karaoke else None

     caption = Caption.read(input_path, normalize_text=normalize_text)
-    caption.write(output_path, include_speaker_in_text=include_speaker_in_text)
+    caption.write(
+        output_path,
+        include_speaker_in_text=include_speaker_in_text,
+        word_level=word_level,
+        karaoke_config=karaoke_config,
+    )

     safe_print(f"✅ Converted {input_path} -> {output_path}")
     return output_path
@@ -96,7 +121,7 @@ def normalize(
     """
     from pathlib import Path

-    from lattifai.caption import Caption
+    from lattifai.data import Caption

     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()
@@ -151,7 +176,7 @@ def shift(
     """
     from pathlib import Path

-    from lattifai.caption import Caption
+    from lattifai.data import Caption

     input_path = Path(input_path).expanduser()
     output_path = Path(output_path).expanduser()
@@ -178,6 +203,88 @@
     return output_path


+@run.cli.entrypoint(name="diff", namespace="caption")
+def diff(
+    ref_path: Pathlike,
+    hyp_path: Pathlike,
+    split_sentence: bool = True,
+    verbose: bool = True,
+):
+    """
+    Compare and align caption supervisions with transcription segments.
+
+    This command reads a reference caption file and a hypothesis file, then performs
+    text alignment to show how they match up. It's useful for comparing
+    original subtitles against ASR (Automatic Speech Recognition) results.
+
+    Args:
+        ref_path: Path to reference caption file (ground truth)
+        hyp_path: Path to hypothesis file (e.g., ASR results)
+        split_sentence: Enable sentence splitting before alignment (default: True)
+        verbose: Enable verbose output to show detailed alignment info (default: True)
+
+    Examples:
+        # Compare reference with hypothesis (positional arguments)
+        lai caption diff subtitles.srt transcription.json
+
+        # Disable sentence splitting
+        lai caption diff subtitles.srt transcription.json split_sentence=false
+
+        # Disable verbose output
+        lai caption diff subtitles.srt transcription.json verbose=false
+    """
+    from pathlib import Path
+
+    from lattifai.alignment.text_align import align_supervisions_and_transcription
+    from lattifai.caption import SentenceSplitter
+    from lattifai.data import Caption
+
+    ref_path = Path(ref_path).expanduser()
+    hyp_path = Path(hyp_path).expanduser()
+
+    # Read reference caption (supervisions)
+    caption_obj = Caption.read(ref_path)
+
+    # Read hypothesis
+    hyp_obj = Caption.read(hyp_path)
+
+    # Apply sentence splitting if enabled
+    if split_sentence:
+        splitter = SentenceSplitter(device="cpu", lazy_init=True)
+        caption_obj.supervisions = splitter.split_sentences(caption_obj.supervisions)
+        hyp_obj.supervisions = splitter.split_sentences(hyp_obj.supervisions)
+
+    # Set transcription on caption object
+    caption_obj.transcription = hyp_obj.supervisions
+
+    safe_print(f"📖 Reference: {len(caption_obj.supervisions)} segments from {ref_path}")
+    safe_print(f"🎤 Hypothesis: {len(caption_obj.transcription)} segments from {hyp_path}")
+    if split_sentence:
+        safe_print("✂️ Sentence splitting: enabled")
+    safe_print("")
+
+    # Perform alignment
+    results = align_supervisions_and_transcription(
+        caption=caption_obj,
+        verbose=verbose,
+    )
+
+    # # Print summary
+    # safe_print("")
+    # safe_print("=" * 72)
+    # safe_print(f"📊 Alignment Summary: {len(results)} groups")
+    # for idx, (sub_align, asr_align, quality, timestamp, typing) in enumerate(results):
+    #     sub_count = len(sub_align) if sub_align else 0
+    #     asr_count = len(asr_align) if asr_align else 0
+    #     safe_print(f"    Group {idx + 1}: ref={sub_count}, hyp={asr_count}, {quality.info}, typing={typing}")
+
+    return results
+
+
+def main_diff():
+    run.cli.main(diff)
+
+
 def main_convert():
     run.cli.main(convert)

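The new convert flags and the diff entrypoint are plain functions, so the same operations can be driven from Python as well as from the lai CLI. A sketch based only on the signatures visible in the hunks above; the file paths are placeholders:

```python
# Sketch only: calls the entrypoints shown in this diff directly from Python.
from lattifai.cli.caption import convert, diff

# Word-per-segment export, then karaoke ASS output (\kf tags), per the new flags
convert("aligned.json", "words.srt", word_level=True)
convert("aligned.json", "karaoke.ass", word_level=True, karaoke=True)

# Compare reference subtitles against ASR output; returns alignment groups
results = diff("subtitles.srt", "transcription.json", split_sentence=False, verbose=True)
```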
lattifai/cli/transcribe.py CHANGED
@@ -92,10 +92,6 @@ def transcribe(
     client_wrapper = SyncAPIClient(config=client_config)
     transcription_config.client_wrapper = client_wrapper

-    # Initialize client wrapper to properly set client_wrapper
-    client_wrapper = SyncAPIClient(config=client_config)
-    transcription_config.client_wrapper = client_wrapper
-
     # Validate input is required
     if not input and not media_config.input_path:
         raise ValueError("Input is required. Provide input as positional argument or media.input_path.")
@@ -129,7 +125,7 @@
     if is_url:
         # Download media first, then transcribe
         safe_print(colorful.cyan(" Downloading media from URL..."))
-        from lattifai.workflow.youtube import YouTubeDownloader
+        from lattifai.youtube import YouTubeDownloader

         downloader = YouTubeDownloader()
         input_path = asyncio.run(
@@ -170,7 +166,7 @@
     safe_print(colorful.cyan(f" Output: {final_output}"))

     # Write output
-    transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)
+    transcriber.write(transcript, final_output, encoding="utf-8", cache_event=False)

     safe_print(colorful.green(f"🎉 Transcription completed: {final_output}"))
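Two of these hunks change caller-facing surface: YouTubeDownloader moved from lattifai.workflow.youtube to the new lattifai.youtube package, and the write() keyword cache_audio_events became cache_event. A migration sketch based solely on the hunks above; the transcriber call is shown as a comment since its setup is not part of this diff:

```python
# Migration sketch for 1.2.1 -> 1.3.0.

# 1.2.1:
#   from lattifai.workflow.youtube import YouTubeDownloader
#   transcriber.write(transcript, output, encoding="utf-8", cache_audio_events=False)

# 1.3.0:
from lattifai.youtube import YouTubeDownloader  # package moved in 1.3.0

# transcriber.write(transcript, output, encoding="utf-8", cache_event=False)  # kwarg renamed
```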