lattifai 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. lattifai/__init__.py +10 -0
  2. lattifai/alignment/lattice1_aligner.py +64 -15
  3. lattifai/alignment/lattice1_worker.py +135 -50
  4. lattifai/alignment/segmenter.py +3 -2
  5. lattifai/alignment/tokenizer.py +14 -13
  6. lattifai/audio2.py +269 -70
  7. lattifai/caption/caption.py +213 -19
  8. lattifai/cli/__init__.py +2 -0
  9. lattifai/cli/alignment.py +2 -1
  10. lattifai/cli/app_installer.py +35 -33
  11. lattifai/cli/caption.py +9 -19
  12. lattifai/cli/diarization.py +108 -0
  13. lattifai/cli/server.py +3 -1
  14. lattifai/cli/transcribe.py +55 -38
  15. lattifai/cli/youtube.py +1 -0
  16. lattifai/client.py +42 -121
  17. lattifai/config/alignment.py +37 -2
  18. lattifai/config/caption.py +1 -1
  19. lattifai/config/media.py +23 -3
  20. lattifai/config/transcription.py +4 -0
  21. lattifai/diarization/lattifai.py +18 -7
  22. lattifai/errors.py +7 -3
  23. lattifai/mixin.py +45 -16
  24. lattifai/server/app.py +2 -1
  25. lattifai/transcription/__init__.py +1 -1
  26. lattifai/transcription/base.py +21 -2
  27. lattifai/transcription/gemini.py +127 -1
  28. lattifai/transcription/lattifai.py +30 -2
  29. lattifai/utils.py +96 -28
  30. lattifai/workflow/file_manager.py +15 -13
  31. lattifai/workflow/youtube.py +16 -1
  32. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/METADATA +86 -22
  33. lattifai-1.1.0.dist-info/RECORD +57 -0
  34. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt +2 -0
  35. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE +1 -1
  36. lattifai-1.0.4.dist-info/RECORD +0 -56
  37. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/WHEEL +0 -0
  38. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/top_level.txt +0 -0
lattifai/audio2.py CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  from collections import namedtuple
4
4
  from pathlib import Path
5
- from typing import BinaryIO, Iterable, Optional, Tuple, Union
5
+ from typing import BinaryIO, Optional, Tuple, Union
6
6
 
7
7
  import numpy as np
8
8
  import soundfile as sf
@@ -16,8 +16,14 @@ from lattifai.errors import AudioLoadError
16
16
  ChannelSelectorType = Union[int, str]
17
17
 
18
18
 
19
- class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "device", "path"])):
20
- """Audio data container with sampling rate, numpy array, tensor, and device information."""
19
+ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "streaming_chunk_secs", "overlap_secs"])):
20
+ """Audio data container with sampling rate and numpy array.
21
+
22
+ Supports iteration to stream audio chunks for processing long audio files.
23
+ The streaming_chunk_secs field indicates whether streaming mode should be used downstream.
24
+ The overlap_secs field specifies the overlap duration between consecutive chunks.
25
+ Note: tensor field removed to reduce memory usage. Convert ndarray to tensor on-demand.
26
+ """
21
27
 
22
28
  def __str__(self) -> str:
23
29
  return self.path
@@ -27,6 +33,66 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "
27
33
  """Duration of the audio in seconds."""
28
34
  return self.ndarray.shape[-1] / self.sampling_rate
29
35
 
36
+ @property
37
+ def streaming_mode(self) -> bool:
38
+ """Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
39
+ if self.streaming_chunk_secs is not None:
40
+ return self.duration > self.streaming_chunk_secs * 1.1
41
+ return False
42
+
43
+ def __iter__(self):
44
+ """Initialize iterator for chunk-based audio streaming.
45
+
46
+ Returns an iterator that yields audio chunks as AudioData instances.
47
+ Uses streaming_chunk_secs and overlap_secs from the instance.
48
+ """
49
+ return self.iter_chunks()
50
+
51
+ def iter_chunks(
52
+ self,
53
+ chunk_secs: Optional[float] = None,
54
+ overlap_secs: Optional[float] = None,
55
+ ):
56
+ """Iterate over audio chunks with configurable duration and overlap.
57
+
58
+ Args:
59
+ chunk_secs: Duration of each chunk in seconds (default: uses streaming_chunk_secs or 600.0).
60
+ overlap_secs: Overlap between consecutive chunks in seconds (default: uses overlap_secs or 0.0).
61
+
62
+ Yields:
63
+ AudioData: Chunks of audio data.
64
+
65
+ Example:
66
+ >>> audio = loader("long_audio.wav")
67
+ >>> for chunk in audio.iter_chunks(chunk_secs=60.0, overlap_secs=2.0):
68
+ ... process(chunk)
69
+ """
70
+ chunk_duration = chunk_secs or self.streaming_chunk_secs or 600.0
71
+ overlap_duration = overlap_secs or self.overlap_secs or 0.0
72
+
73
+ chunk_size = int(chunk_duration * self.sampling_rate)
74
+ overlap_size = int(overlap_duration * self.sampling_rate)
75
+ step_size = chunk_size - overlap_size
76
+ total_samples = self.ndarray.shape[-1]
77
+
78
+ current_offset = 0
79
+ while current_offset < total_samples:
80
+ start = current_offset
81
+ end = min(start + chunk_size, total_samples)
82
+
83
+ # Extract chunk from ndarray only
84
+ chunk_ndarray = self.ndarray[..., start:end]
85
+
86
+ yield AudioData(
87
+ sampling_rate=self.sampling_rate,
88
+ ndarray=chunk_ndarray,
89
+ path=f"{self.path}[{start/self.sampling_rate:.2f}s-{end/self.sampling_rate:.2f}s]",
90
+ streaming_chunk_secs=None,
91
+ overlap_secs=None,
92
+ )
93
+
94
+ current_offset += step_size
95
+
30
96
 
31
97
  class AudioLoader:
32
98
  """Load and preprocess audio files into AudioData format."""
@@ -45,62 +111,48 @@ class AudioLoader:
45
111
 
46
112
  def _resample_audio(
47
113
  self,
48
- audio_sr: Tuple[torch.Tensor, int],
114
+ audio_sr: Tuple[np.ndarray, int],
49
115
  sampling_rate: int,
50
116
  device: Optional[str],
51
117
  channel_selector: Optional[ChannelSelectorType],
52
- ) -> torch.Tensor:
118
+ ) -> np.ndarray:
53
119
  """Resample audio to target sampling rate with channel selection.
54
120
 
55
121
  Args:
56
- audio_sr: Tuple of (audio_tensor, original_sample_rate).
122
+ audio_sr: Tuple of (audio, original_sample_rate).
57
123
  sampling_rate: Target sampling rate.
58
124
  device: Device to perform resampling on.
59
125
  channel_selector: How to select channels.
60
126
 
61
127
  Returns:
62
- Resampled audio tensor of shape (1, T) or (C, T).
128
+ Resampled audio array of shape (1, T) or (C, T).
63
129
  """
64
130
  audio, sr = audio_sr
65
131
 
66
132
  if channel_selector is None:
67
133
  # keep the original multi-channel signal
68
- tensor = audio
134
+ tensor = audio.T
135
+ del audio # Free original audio memory
69
136
  elif isinstance(channel_selector, int):
70
- assert audio.shape[0] >= channel_selector, f"Invalid channel: {channel_selector}"
71
- tensor = audio[channel_selector : channel_selector + 1].clone()
137
+ assert audio.shape[1] >= channel_selector, f"Invalid channel: {channel_selector}"
138
+ tensor = audio[:, channel_selector : channel_selector + 1].T.copy()
72
139
  del audio
73
140
  elif isinstance(channel_selector, str):
74
141
  assert channel_selector == "average"
75
- tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
142
+ tensor = np.mean(audio, axis=1, keepdims=True).T
76
143
  del audio
77
144
  else:
78
145
  raise ValueError(f"Unsupported channel_selector: {channel_selector}")
79
- # assert isinstance(channel_selector, Iterable)
80
- # num_channels = audio.shape[0]
81
- # print(f"Selecting channels {channel_selector} from the signal with {num_channels} channels.")
82
- # if max(channel_selector) >= num_channels:
83
- # raise ValueError(
84
- # f"Cannot select channel subset {channel_selector} from a signal with {num_channels} channels."
85
- # )
86
- # tensor = audio[channel_selector]
87
-
88
- tensor = tensor.to(device)
146
+
147
+ # tensor: np.ndarray (channels, samples)
89
148
  if sr != sampling_rate:
90
149
  cache_key = (sr, sampling_rate, device)
91
150
  if cache_key not in self._resampler_cache:
92
151
  self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
93
152
  resampler = self._resampler_cache[cache_key]
94
153
 
95
- length = tensor.size(-1)
96
- chunk_size = sampling_rate * 3600
97
- if length > chunk_size:
98
- resampled_chunks = []
99
- for i in range(0, length, chunk_size):
100
- resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
101
- tensor = torch.cat(resampled_chunks, dim=-1)
102
- else:
103
- tensor = resampler(tensor)
154
+ tensor = resampler(torch.from_numpy(tensor).to(device=device))
155
+ tensor = tensor.cpu().numpy()
104
156
 
105
157
  return tensor
106
158
 
@@ -109,7 +161,7 @@ class AudioLoader:
109
161
  audio: Union[Pathlike, BinaryIO],
110
162
  sampling_rate: int,
111
163
  channel_selector: Optional[ChannelSelectorType],
112
- ) -> torch.Tensor:
164
+ ) -> np.ndarray:
113
165
  """Load audio from file or binary stream and resample to target rate.
114
166
 
115
167
  Args:
@@ -118,7 +170,7 @@ class AudioLoader:
118
170
  channel_selector: How to select channels.
119
171
 
120
172
  Returns:
121
- Resampled audio tensor.
173
+ Resampled audio as a NumPy array of shape (channels, samples).
122
174
 
123
175
  Raises:
124
176
  ImportError: If PyAV is needed but not installed.
@@ -128,11 +180,69 @@ class AudioLoader:
128
180
  if isinstance(audio, Pathlike):
129
181
  audio = str(Path(str(audio)).expanduser())
130
182
 
131
- # load audio
183
+ # load audio in chunks to reduce memory footprint for long files
132
184
  try:
133
- waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32") # numpy array
134
- waveform = waveform.T # (channels, samples)
185
+ # First check file duration to decide loading strategy
186
+ info = sf.info(audio)
187
+ duration = info.duration
188
+
189
+ # For very long audio (>60 minutes), use chunk-based loading
190
+ if duration > 3600: # 60 minutes
191
+ with sf.SoundFile(audio, "r") as f:
192
+ sample_rate = f.samplerate
193
+ total_frames = f.frames
194
+
195
+ # Pre-calculate output size to avoid list accumulation
196
+ num_channels = 1 if channel_selector else f.channels
197
+ expected_output_samples = int(total_frames * sampling_rate / sample_rate)
198
+
199
+ # Pre-allocate output array
200
+ waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
201
+
202
+ # Use source sample rate for reading, not target
203
+ chunk_frames = int(sample_rate * 1800) # 30-minute chunks at source rate
204
+ output_offset = 0
205
+
206
+ while True:
207
+ chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
208
+ if chunk.size == 0:
209
+ break
210
+
211
+ # Resample chunk -> (channels, samples)
212
+ resampled_chunk = self._resample_audio(
213
+ (chunk, sample_rate),
214
+ sampling_rate,
215
+ device=self.device,
216
+ channel_selector=channel_selector,
217
+ )
218
+
219
+ # Write directly to pre-allocated array
220
+ chunk_length = resampled_chunk.shape[-1]
221
+ waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
222
+ output_offset += chunk_length
223
+
224
+ # Clean up immediately
225
+ del chunk, resampled_chunk
226
+
227
+ # Trim to actual size if needed (due to rounding in resampling)
228
+ if output_offset < expected_output_samples:
229
+ waveform = waveform[..., :output_offset]
230
+
231
+ return waveform
232
+ else:
233
+ # For shorter audio, use standard loading
234
+ waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
235
+ # Resample and return directly to avoid double processing
236
+ result = self._resample_audio(
237
+ (waveform, sample_rate),
238
+ sampling_rate,
239
+ device=self.device,
240
+ channel_selector=channel_selector,
241
+ )
242
+ del waveform
243
+ return result
135
244
  except Exception as primary_error:
245
+ print(f"Primary error with soundfile: {primary_error}")
136
246
  # Fallback to PyAV for formats not supported by soundfile
137
247
  try:
138
248
  import av
@@ -150,62 +260,151 @@ class AudioLoader:
150
260
  if audio_stream is None:
151
261
  raise ValueError(f"No audio stream found in file: {audio}")
152
262
 
153
- # Resample to target sample rate during decoding
154
263
  audio_stream.codec_context.format = av.AudioFormat("flt") # 32-bit float
155
-
156
- frames = []
157
- for frame in container.decode(audio_stream):
158
- # Convert frame to numpy array
159
- array = frame.to_ndarray()
160
- # Ensure shape is (channels, samples)
161
- if array.ndim == 1:
162
- array = array.reshape(1, -1)
163
- elif array.ndim == 2 and array.shape[0] > array.shape[1]:
164
- array = array.T
165
- frames.append(array)
166
-
167
- container.close()
168
-
169
- if not frames:
170
- raise ValueError(f"No audio data found in file: {audio}")
171
-
172
- # Concatenate all frames
173
- waveform = np.concatenate(frames, axis=1).astype(np.float32) # (channels, samples)
174
264
  sample_rate = audio_stream.codec_context.sample_rate
265
+
266
+ # Estimate duration to decide processing strategy
267
+ duration_estimate = None
268
+ if audio_stream.duration and audio_stream.time_base:
269
+ duration_estimate = float(audio_stream.duration * audio_stream.time_base)
270
+ else:
271
+ print(f"WARNING: Failed to estimate duration for audio: {audio}")
272
+
273
+ # For very long audio (>30 minutes), process and resample in chunks
274
+ if duration_estimate and duration_estimate > 1800:
275
+ # Estimate output size and pre-allocate with buffer
276
+ num_channels = 1 if channel_selector else audio_stream.codec_context.channels
277
+ estimated_samples = int(duration_estimate * sampling_rate * 1.1) # 10% buffer
278
+ waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
279
+
280
+ frames = []
281
+ accumulated_samples = 0
282
+ output_offset = 0
283
+ chunk_sample_target = int(sample_rate * 600) # 10 minutes at original rate
284
+
285
+ for frame in container.decode(audio_stream):
286
+ array = frame.to_ndarray()
287
+
288
+ # Ensure shape is (samples, channels)
289
+ if array.ndim == 1:
290
+ array = array.reshape(-1, 1)
291
+ elif array.ndim == 2 and array.shape[0] < array.shape[1]:
292
+ array = array.T
293
+
294
+ frames.append(array)
295
+ accumulated_samples += array.shape[0]
296
+
297
+ # Process chunk when accumulated enough samples
298
+ if accumulated_samples >= chunk_sample_target:
299
+ chunk = np.concatenate(frames, axis=0).astype(np.float32)
300
+ del frames # Free frames list before resampling
301
+ # Resample chunk -> (channels, samples)
302
+ resampled_chunk = self._resample_audio(
303
+ (chunk, sample_rate),
304
+ sampling_rate,
305
+ device=self.device,
306
+ channel_selector=channel_selector,
307
+ )
308
+
309
+ chunk_length = resampled_chunk.shape[-1]
310
+ if output_offset + chunk_length > waveform.shape[-1]:
311
+ print(
312
+ f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}" # noqa: E501
313
+ )
314
+ resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
315
+
316
+ # Write directly to array
317
+ waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
318
+ output_offset += chunk_length
319
+
320
+ # Clean up immediately
321
+ del chunk, resampled_chunk
322
+ frames = [] # Create new list
323
+ accumulated_samples = 0
324
+
325
+ # Process remaining frames
326
+ if frames:
327
+ chunk = np.concatenate(frames, axis=0).astype(np.float32)
328
+ del frames
329
+ resampled_chunk = self._resample_audio(
330
+ (chunk, sample_rate),
331
+ sampling_rate,
332
+ device=self.device,
333
+ channel_selector=channel_selector,
334
+ )
335
+
336
+ chunk_length = resampled_chunk.shape[-1]
337
+ if output_offset + chunk_length > waveform.shape[-1]:
338
+ print(
339
+ f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}" # noqa: E501
340
+ )
341
+ resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
342
+
343
+ waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
344
+ output_offset += chunk_length
345
+ del chunk, resampled_chunk
346
+
347
+ container.close()
348
+
349
+ if output_offset == 0:
350
+ raise ValueError(f"No audio data found in file: {audio}")
351
+
352
+ # Trim to actual size
353
+ waveform = waveform[..., :output_offset]
354
+ return waveform
355
+ else:
356
+ # For shorter audio, process in batches to reduce memory
357
+ frames = []
358
+ for frame in container.decode(audio_stream):
359
+ array = frame.to_ndarray()
360
+ # Ensure shape is (channels, samples)
361
+ if array.ndim == 1:
362
+ array = array.reshape(-1, 1)
363
+ elif array.ndim == 2 and array.shape[0] < array.shape[1]:
364
+ array = array.T
365
+ frames.append(array)
366
+ container.close()
367
+
368
+ if not frames:
369
+ raise ValueError(f"No audio data found in file: {audio}")
370
+
371
+ # Concatenate remaining frames
372
+ waveform = np.concatenate(frames, axis=0).astype(np.float32)
373
+ del frames
374
+ # Resample and return directly
375
+ result = self._resample_audio(
376
+ (waveform, sample_rate),
377
+ sampling_rate,
378
+ device=self.device,
379
+ channel_selector=channel_selector,
380
+ )
381
+ del waveform
382
+ return result
175
383
  except Exception as e:
176
384
  raise RuntimeError(f"Failed to load audio file {audio}: {e}")
177
385
 
178
- return self._resample_audio(
179
- (torch.from_numpy(waveform), sample_rate),
180
- sampling_rate,
181
- device=self.device,
182
- channel_selector=channel_selector,
183
- )
184
-
185
386
  def __call__(
186
387
  self,
187
388
  audio: Union[Pathlike, BinaryIO],
188
389
  sampling_rate: int = 16000,
189
390
  channel_selector: Optional[ChannelSelectorType] = "average",
391
+ streaming_chunk_secs: Optional[float] = None,
190
392
  ) -> AudioData:
191
393
  """
192
394
  Args:
193
395
  audio: Path to audio file or binary stream.
194
396
  channel_selector: How to select channels (default: "average").
195
397
  sampling_rate: Target sampling rate (default: use instance sampling_rate).
398
+ streaming_chunk_secs: Duration in seconds for streaming chunks (default: None, disabled).
196
399
 
197
400
  Returns:
198
- AudioData namedtuple with sampling_rate, ndarray, and tensor fields.
401
+ AudioData namedtuple with sampling_rate, ndarray, and streaming_chunk_secs fields.
199
402
  """
200
- tensor = self._load_audio(audio, sampling_rate, channel_selector)
201
-
202
- # tensor is (1, T) or (C, T)
203
- ndarray = tensor.cpu().numpy()
204
-
403
+ ndarray = self._load_audio(audio, sampling_rate, channel_selector)
205
404
  return AudioData(
206
405
  sampling_rate=sampling_rate,
207
406
  ndarray=ndarray,
208
- tensor=tensor,
209
- device=self.device,
210
407
  path=str(audio) if isinstance(audio, Pathlike) else "<BinaryIO>",
408
+ streaming_chunk_secs=streaming_chunk_secs,
409
+ overlap_secs=0.0,
211
410
  )