lattifai 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lattifai/audio2.py CHANGED
@@ -2,7 +2,7 @@
 
 from collections import namedtuple
 from pathlib import Path
-from typing import BinaryIO, Iterable, Optional, Tuple, Union
+from typing import BinaryIO, Optional, Tuple, Union
 
 import numpy as np
 import soundfile as sf
@@ -16,8 +16,14 @@ from lattifai.errors import AudioLoadError
 ChannelSelectorType = Union[int, str]
 
 
-class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "device", "path"])):
-    """Audio data container with sampling rate, numpy array, tensor, and device information."""
+class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "streaming_chunk_secs", "overlap_secs"])):
+    """Audio data container with sampling rate and numpy array.
+
+    Supports iteration to stream audio chunks for processing long audio files.
+    The streaming_chunk_secs field indicates whether streaming mode should be used downstream.
+    The overlap_secs field specifies the overlap duration between consecutive chunks.
+    Note: tensor field removed to reduce memory usage. Convert ndarray to tensor on-demand.
+    """
 
     def __str__(self) -> str:
         return self.path
@@ -27,6 +33,66 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "
         """Duration of the audio in seconds."""
         return self.ndarray.shape[-1] / self.sampling_rate
 
+    @property
+    def streaming_mode(self) -> bool:
+        """Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
+        if self.streaming_chunk_secs is not None:
+            return self.duration > self.streaming_chunk_secs * 1.1
+        return False
+
+    def __iter__(self):
+        """Initialize iterator for chunk-based audio streaming.
+
+        Returns an iterator that yields audio chunks as AudioData instances.
+        Uses streaming_chunk_secs and overlap_secs from the instance.
+        """
+        return self.iter_chunks()
+
+    def iter_chunks(
+        self,
+        chunk_secs: Optional[float] = None,
+        overlap_secs: Optional[float] = None,
+    ):
+        """Iterate over audio chunks with configurable duration and overlap.
+
+        Args:
+            chunk_secs: Duration of each chunk in seconds (default: uses streaming_chunk_secs or 600.0).
+            overlap_secs: Overlap between consecutive chunks in seconds (default: uses overlap_secs or 0.0).
+
+        Yields:
+            AudioData: Chunks of audio data.
+
+        Example:
+            >>> audio = loader("long_audio.wav")
+            >>> for chunk in audio.iter_chunks(chunk_secs=60.0, overlap_secs=2.0):
+            ...     process(chunk)
+        """
+        chunk_duration = chunk_secs or self.streaming_chunk_secs or 600.0
+        overlap_duration = overlap_secs or self.overlap_secs or 0.0
+
+        chunk_size = int(chunk_duration * self.sampling_rate)
+        overlap_size = int(overlap_duration * self.sampling_rate)
+        step_size = chunk_size - overlap_size
+        total_samples = self.ndarray.shape[-1]
+
+        current_offset = 0
+        while current_offset < total_samples:
+            start = current_offset
+            end = min(start + chunk_size, total_samples)
+
+            # Extract chunk from ndarray only
+            chunk_ndarray = self.ndarray[..., start:end]
+
+            yield AudioData(
+                sampling_rate=self.sampling_rate,
+                ndarray=chunk_ndarray,
+                path=f"{self.path}[{start/self.sampling_rate:.2f}s-{end/self.sampling_rate:.2f}s]",
+                streaming_chunk_secs=None,
+                overlap_secs=None,
+            )
+
+            current_offset += step_size
+
 
 class AudioLoader:
     """Load and preprocess audio files into AudioData format."""
@@ -45,62 +111,48 @@ class AudioLoader:
 
     def _resample_audio(
         self,
-        audio_sr: Tuple[torch.Tensor, int],
+        audio_sr: Tuple[np.ndarray, int],
         sampling_rate: int,
         device: Optional[str],
         channel_selector: Optional[ChannelSelectorType],
-    ) -> torch.Tensor:
+    ) -> np.ndarray:
         """Resample audio to target sampling rate with channel selection.
 
         Args:
-            audio_sr: Tuple of (audio_tensor, original_sample_rate).
+            audio_sr: Tuple of (audio, original_sample_rate); audio has shape (samples, channels).
             sampling_rate: Target sampling rate.
             device: Device to perform resampling on.
             channel_selector: How to select channels.
 
         Returns:
-            Resampled audio tensor of shape (1, T) or (C, T).
+            Resampled audio array of shape (1, T) or (C, T).
         """
         audio, sr = audio_sr
 
         if channel_selector is None:
             # keep the original multi-channel signal
-            tensor = audio
+            tensor = audio.T
+            del audio  # Free original audio memory
         elif isinstance(channel_selector, int):
-            assert audio.shape[0] >= channel_selector, f"Invalid channel: {channel_selector}"
-            tensor = audio[channel_selector : channel_selector + 1].clone()
+            assert audio.shape[1] > channel_selector, f"Invalid channel: {channel_selector}"
+            tensor = audio[:, channel_selector : channel_selector + 1].T.copy()
             del audio
         elif isinstance(channel_selector, str):
             assert channel_selector == "average"
-            tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
+            tensor = np.mean(audio, axis=1, keepdims=True).T
             del audio
         else:
             raise ValueError(f"Unsupported channel_selector: {channel_selector}")
-        # assert isinstance(channel_selector, Iterable)
-        # num_channels = audio.shape[0]
-        # print(f"Selecting channels {channel_selector} from the signal with {num_channels} channels.")
-        # if max(channel_selector) >= num_channels:
-        #     raise ValueError(
-        #         f"Cannot select channel subset {channel_selector} from a signal with {num_channels} channels."
-        #     )
-        # tensor = audio[channel_selector]
-
-        tensor = tensor.to(device)
+
+        # tensor: np.ndarray (channels, samples)
         if sr != sampling_rate:
             cache_key = (sr, sampling_rate, device)
             if cache_key not in self._resampler_cache:
                 self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
             resampler = self._resampler_cache[cache_key]
 
-            length = tensor.size(-1)
-            chunk_size = sampling_rate * 3600
-            if length > chunk_size:
-                resampled_chunks = []
-                for i in range(0, length, chunk_size):
-                    resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
-                tensor = torch.cat(resampled_chunks, dim=-1)
-            else:
-                tensor = resampler(tensor)
+            tensor = resampler(torch.from_numpy(tensor).to(device=device))
+            tensor = tensor.cpu().numpy()
 
         return tensor
 
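
The numpy-first flow above can be exercised standalone. In this sketch, torchaudio.transforms.Resample stands in for the package's internal get_or_create_resampler, and the sample rates are illustrative:

    import numpy as np
    import torch
    import torchaudio

    audio = np.random.randn(44100 * 3, 2).astype(np.float32)  # (samples, channels), as soundfile returns

    # "average" channel selector: mean across channels, then transpose to (1, samples)
    tensor = np.mean(audio, axis=1, keepdims=True).T

    # Round-trip through torch only for the resampler, then back to numpy
    resampler = torchaudio.transforms.Resample(44100, 16000)
    resampled = resampler(torch.from_numpy(tensor)).cpu().numpy()

    print(resampled.shape)  # (1, 48000)
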
@@ -109,7 +161,7 @@ class AudioLoader:
         audio: Union[Pathlike, BinaryIO],
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
-    ) -> torch.Tensor:
+    ) -> np.ndarray:
         """Load audio from file or binary stream and resample to target rate.
 
         Args:
@@ -118,7 +170,7 @@ class AudioLoader:
             channel_selector: How to select channels.
 
         Returns:
-            Resampled audio tensor.
+            Resampled audio as a NumPy array of shape (channels, samples).
 
         Raises:
             ImportError: If PyAV is needed but not installed.
@@ -128,11 +180,69 @@ class AudioLoader:
         if isinstance(audio, Pathlike):
             audio = str(Path(str(audio)).expanduser())
 
-        # load audio
+        # load audio in chunks to reduce memory footprint for long files
         try:
-            waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")  # numpy array
-            waveform = waveform.T  # (channels, samples)
+            # First check file duration to decide loading strategy
+            info = sf.info(audio)
+            duration = info.duration
+
+            # For very long audio (>60 minutes), use chunk-based loading
+            if duration > 3600:  # 60 minutes
+                with sf.SoundFile(audio, "r") as f:
+                    sample_rate = f.samplerate
+                    total_frames = f.frames
+
+                    # Pre-calculate output size to avoid list accumulation
+                    num_channels = 1 if channel_selector is not None else f.channels
+                    expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+
+                    # Pre-allocate output array
+                    waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
+
+                    # Use source sample rate for reading, not target
+                    chunk_frames = int(sample_rate * 1800)  # 30-minute chunks at source rate
+                    output_offset = 0
+
+                    while True:
+                        chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                        if chunk.size == 0:
+                            break
+
+                        # Resample chunk -> (channels, samples)
+                        resampled_chunk = self._resample_audio(
+                            (chunk, sample_rate),
+                            sampling_rate,
+                            device=self.device,
+                            channel_selector=channel_selector,
+                        )
+
+                        # Write directly to pre-allocated array
+                        chunk_length = resampled_chunk.shape[-1]
+                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                        output_offset += chunk_length
+
+                        # Clean up immediately
+                        del chunk, resampled_chunk
+
+                # Trim to actual size if needed (due to rounding in resampling)
+                if output_offset < expected_output_samples:
+                    waveform = waveform[..., :output_offset]
+
+                return waveform
+            else:
+                # For shorter audio, use standard loading
+                waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+                # Resample and return directly to avoid double processing
+                result = self._resample_audio(
+                    (waveform, sample_rate),
+                    sampling_rate,
+                    device=self.device,
+                    channel_selector=channel_selector,
+                )
+                del waveform
+                return result
         except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
             # Fallback to PyAV for formats not supported by soundfile
             try:
                 import av
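
The chunked read in the long-file branch follows the usual soundfile pattern: repeated SoundFile.read calls at the source rate until the file is exhausted. A minimal sketch, assuming a local long_audio.wav and using a small block size for demonstration:

    import soundfile as sf

    with sf.SoundFile("long_audio.wav", "r") as f:
        block_frames = f.samplerate * 5  # 5-second blocks; the loader uses 30 minutes
        total = 0
        while True:
            block = f.read(frames=block_frames, dtype="float32", always_2d=True)
            if block.size == 0:
                break
            total += block.shape[0]  # each block is (samples, channels)
        assert total == f.frames
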
@@ -150,62 +260,151 @@ class AudioLoader:
                 if audio_stream is None:
                     raise ValueError(f"No audio stream found in file: {audio}")
 
-                # Resample to target sample rate during decoding
                 audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-
-                frames = []
-                for frame in container.decode(audio_stream):
-                    # Convert frame to numpy array
-                    array = frame.to_ndarray()
-                    # Ensure shape is (channels, samples)
-                    if array.ndim == 1:
-                        array = array.reshape(1, -1)
-                    elif array.ndim == 2 and array.shape[0] > array.shape[1]:
-                        array = array.T
-                    frames.append(array)
-
-                container.close()
-
-                if not frames:
-                    raise ValueError(f"No audio data found in file: {audio}")
-
-                # Concatenate all frames
-                waveform = np.concatenate(frames, axis=1).astype(np.float32)  # (channels, samples)
                 sample_rate = audio_stream.codec_context.sample_rate
+
+                # Estimate duration to decide processing strategy
+                duration_estimate = None
+                if audio_stream.duration and audio_stream.time_base:
+                    duration_estimate = float(audio_stream.duration * audio_stream.time_base)
+                else:
+                    print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+                # For very long audio (>30 minutes), process and resample in chunks
+                if duration_estimate and duration_estimate > 1800:
+                    # Estimate output size and pre-allocate with buffer
+                    num_channels = 1 if channel_selector is not None else audio_stream.codec_context.channels
+                    estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
+                    waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                    frames = []
+                    accumulated_samples = 0
+                    output_offset = 0
+                    chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
+
+                    for frame in container.decode(audio_stream):
+                        array = frame.to_ndarray()
+
+                        # Ensure shape is (samples, channels)
+                        if array.ndim == 1:
+                            array = array.reshape(-1, 1)
+                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                            array = array.T
+
+                        frames.append(array)
+                        accumulated_samples += array.shape[0]
+
+                        # Process chunk when enough samples have accumulated
+                        if accumulated_samples >= chunk_sample_target:
+                            chunk = np.concatenate(frames, axis=0).astype(np.float32)
+                            del frames  # Free frames list before resampling
+                            # Resample chunk -> (channels, samples)
+                            resampled_chunk = self._resample_audio(
+                                (chunk, sample_rate),
+                                sampling_rate,
+                                device=self.device,
+                                channel_selector=channel_selector,
+                            )
+
+                            chunk_length = resampled_chunk.shape[-1]
+                            if output_offset + chunk_length > waveform.shape[-1]:
+                                print(
+                                    f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
+                                )
+                                resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+                                chunk_length = resampled_chunk.shape[-1]  # recompute after trimming
+
+                            # Write directly to array
+                            waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                            output_offset += chunk_length
+
+                            # Clean up immediately
+                            del chunk, resampled_chunk
+                            frames = []  # Create new list
+                            accumulated_samples = 0
+
+                    # Process remaining frames
+                    if frames:
+                        chunk = np.concatenate(frames, axis=0).astype(np.float32)
+                        del frames
+                        resampled_chunk = self._resample_audio(
+                            (chunk, sample_rate),
+                            sampling_rate,
+                            device=self.device,
+                            channel_selector=channel_selector,
+                        )
+
+                        chunk_length = resampled_chunk.shape[-1]
+                        if output_offset + chunk_length > waveform.shape[-1]:
+                            print(
+                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
+                            )
+                            resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+                            chunk_length = resampled_chunk.shape[-1]  # recompute after trimming
+
+                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                        output_offset += chunk_length
+                        del chunk, resampled_chunk
+
+                    container.close()
+
+                    if output_offset == 0:
+                        raise ValueError(f"No audio data found in file: {audio}")
+
+                    # Trim to actual size
+                    waveform = waveform[..., :output_offset]
+                    return waveform
+                else:
+                    # For shorter audio, process in batches to reduce memory
+                    frames = []
+                    for frame in container.decode(audio_stream):
+                        array = frame.to_ndarray()
+                        # Ensure shape is (samples, channels)
+                        if array.ndim == 1:
+                            array = array.reshape(-1, 1)
+                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                            array = array.T
+                        frames.append(array)
+                    container.close()
+
+                    if not frames:
+                        raise ValueError(f"No audio data found in file: {audio}")
+
+                    # Concatenate remaining frames
+                    waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                    del frames
+                    # Resample and return directly
+                    result = self._resample_audio(
+                        (waveform, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )
+                    del waveform
+                    return result
             except Exception as e:
                 raise RuntimeError(f"Failed to load audio file {audio}: {e}")
 
-        return self._resample_audio(
-            (torch.from_numpy(waveform), sample_rate),
-            sampling_rate,
-            device=self.device,
-            channel_selector=channel_selector,
-        )
-
     def __call__(
         self,
         audio: Union[Pathlike, BinaryIO],
         sampling_rate: int = 16000,
         channel_selector: Optional[ChannelSelectorType] = "average",
+        streaming_chunk_secs: Optional[float] = None,
     ) -> AudioData:
         """
         Args:
             audio: Path to audio file or binary stream.
             channel_selector: How to select channels (default: "average").
             sampling_rate: Target sampling rate (default: use instance sampling_rate).
+            streaming_chunk_secs: Duration in seconds for streaming chunks (default: None, disabled).
 
         Returns:
-            AudioData namedtuple with sampling_rate, ndarray, and tensor fields.
+            AudioData namedtuple with sampling_rate, ndarray, and streaming_chunk_secs fields.
         """
-        tensor = self._load_audio(audio, sampling_rate, channel_selector)
-
-        # tensor is (1, T) or (C, T)
-        ndarray = tensor.cpu().numpy()
-
+        ndarray = self._load_audio(audio, sampling_rate, channel_selector)
         return AudioData(
             sampling_rate=sampling_rate,
             ndarray=ndarray,
-            tensor=tensor,
-            device=self.device,
             path=str(audio) if isinstance(audio, Pathlike) else "<BinaryIO>",
+            streaming_chunk_secs=streaming_chunk_secs,
+            overlap_secs=0.0,
        )
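
End to end, the new loader plus the streaming AudioData would presumably be used as below; the AudioLoader constructor arguments, file name, and transcribe callback are assumptions for illustration:

    from lattifai.audio2 import AudioLoader

    loader = AudioLoader()
    audio = loader("podcast_episode.wav", sampling_rate=16000, streaming_chunk_secs=600.0)

    def transcribe(ndarray):  # placeholder downstream consumer
        ...

    if audio.streaming_mode:
        for chunk in audio:  # overlap_secs is fixed at 0.0 by __call__
            transcribe(chunk.ndarray)
    else:
        transcribe(audio.ndarray)
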
@@ -307,7 +307,7 @@ class Caption:
         cls,
         path: Pathlike,
         format: Optional[str] = None,
-        normalize_text: bool = False,
+        normalize_text: bool = True,
     ) -> "Caption":
         """
         Read caption file and return Caption object.
@@ -505,6 +505,8 @@ class Caption:
             cls._write_csv(alignments, output_path, include_speaker_in_text)
         elif str(output_path)[-4:].lower() == ".aud":
             cls._write_aud(alignments, output_path, include_speaker_in_text)
+        elif str(output_path)[-4:].lower() == ".sbv":
+            cls._write_sbv(alignments, output_path, include_speaker_in_text)
         else:
             import pysubs2
 
@@ -535,7 +537,14 @@ class Caption:
                        name=sup.speaker or "",
                    )
                )
-            subs.save(output_path)
+
+            # MicroDVD format requires framerate to be specified
+            output_ext = str(output_path).lower().split(".")[-1]
+            if output_ext == "sub":
+                # Default to 25 fps for MicroDVD format if not specified
+                subs.save(output_path, fps=25.0)
+            else:
+                subs.save(output_path)
 
         return output_path
 
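
The fps branch exists because MicroDVD timestamps are frame numbers rather than times, so pysubs2 cannot serialize a ".sub" file without a framerate. A minimal sketch with illustrative file names:

    import pysubs2

    subs = pysubs2.SSAFile()
    subs.append(pysubs2.SSAEvent(start=0, end=2000, text="Hello"))  # times in milliseconds

    subs.save("captions.srt")            # time-based formats need no framerate
    subs.save("captions.sub", fps=25.0)  # 2000 ms becomes frame 50 at 25 fps
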
@@ -821,7 +830,7 @@ class Caption:
         if cls._is_youtube_vtt_with_word_timestamps(content):
             return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
 
-        if format == "gemini" or str(caption).endswith("Gemini.md"):
+        if format == "gemini" or str(caption).endswith("Gemini.md") or str(caption).endswith("Gemini3.md"):
             from .gemini_reader import GeminiReader
 
             supervisions = GeminiReader.extract_for_alignment(caption)
@@ -850,6 +859,8 @@ class Caption:
             supervisions = cls._parse_csv(caption, normalize_text)
         elif format == "aud" or str(caption)[-4:].lower() == ".aud":
             supervisions = cls._parse_aud(caption, normalize_text)
+        elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
+            supervisions = cls._parse_sbv(caption, normalize_text)
         elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
             if not Path(str(caption)).exists():  # str
                 lines = [line.strip() for line in str(caption).split("\n")]
@@ -1113,6 +1124,101 @@ class Caption:
 
         return supervisions
 
+    @classmethod
+    def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
+        """
+        Parse SubViewer (SBV) format caption file.
+
+        Format:
+            0:00:00.000,0:00:02.000
+            Text line 1
+
+            0:00:02.000,0:00:04.000
+            Text line 2
+
+        Args:
+            caption: Caption file path
+            normalize_text: Whether to normalize text
+
+        Returns:
+            List of Supervision objects
+        """
+        caption_path = Path(str(caption))
+        if not caption_path.exists():
+            raise FileNotFoundError(f"Caption file not found: {caption}")
+
+        supervisions = []
+
+        with open(caption_path, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Split by double newlines to separate entries
+        entries = content.strip().split("\n\n")
+
+        for entry in entries:
+            lines = entry.strip().split("\n")
+            if len(lines) < 2:
+                continue
+
+            # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
+            timestamp_line = lines[0].strip()
+            # Remaining lines: text
+            text_lines = lines[1:]
+
+            try:
+                # Parse timestamp: 0:00:00.000,0:00:02.000
+                if "," not in timestamp_line:
+                    continue
+
+                start_str, end_str = timestamp_line.split(",", 1)
+
+                # Parse start time
+                start_parts = start_str.strip().split(":")
+                if len(start_parts) == 3:
+                    h, m, s = start_parts
+                    s_parts = s.split(".")
+                    start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
+                    if len(s_parts) > 1:
+                        start += int(s_parts[1]) / 1000.0
+                else:
+                    continue
+
+                # Parse end time
+                end_parts = end_str.strip().split(":")
+                if len(end_parts) == 3:
+                    h, m, s = end_parts
+                    s_parts = s.split(".")
+                    end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
+                    if len(s_parts) > 1:
+                        end += int(s_parts[1]) / 1000.0
+                else:
+                    continue
+
+                # Parse text and speaker
+                text = " ".join(text_lines).strip()
+                speaker, text = parse_speaker_text(text)
+
+                if normalize_text:
+                    text = normalize_text_fn(text)
+
+                duration = end - start
+                if duration < 0:
+                    continue
+
+                supervisions.append(
+                    Supervision(
+                        text=text,
+                        start=start,
+                        duration=duration,
+                        speaker=speaker,
+                    )
+                )
+            except (ValueError, IndexError):
+                # Skip malformed entries
+                continue
+
+        return supervisions
+
     @classmethod
     def _write_tsv(
         cls,
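
The timestamp arithmetic in _parse_sbv can be checked in isolation; the helper and sample line below are illustrative, not part of the package:

    def to_seconds(ts: str) -> float:
        # H:MM:SS.mmm -> seconds, mirroring the split(":") / split(".") logic above
        h, m, s = ts.strip().split(":")
        sec, _, ms = s.partition(".")
        return int(h) * 3600 + int(m) * 60 + int(sec) + (int(ms) / 1000.0 if ms else 0.0)

    start_str, end_str = "0:01:02.500,0:01:04.250".split(",", 1)
    print(to_seconds(start_str), to_seconds(end_str))  # 62.5 64.25
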
@@ -1217,6 +1323,58 @@ class Caption:
 
                 file.write(f"{start}\t{end}\t{text}\n")
 
+    @classmethod
+    def _write_sbv(
+        cls,
+        alignments: List[Supervision],
+        output_path: Pathlike,
+        include_speaker_in_text: bool = True,
+    ) -> None:
+        """
+        Write caption to SubViewer (SBV) format.
+
+        Format:
+            0:00:00.000,0:00:02.000
+            Text line 1
+
+            0:00:02.000,0:00:04.000
+            Text line 2
+
+        Args:
+            alignments: List of supervision segments to write
+            output_path: Path to output SBV file
+            include_speaker_in_text: Whether to include speaker in text
+        """
+        with open(output_path, "w", encoding="utf-8") as file:
+            for i, supervision in enumerate(alignments):
+                # Format timestamps as H:MM:SS.mmm
+                start_h = int(supervision.start // 3600)
+                start_m = int((supervision.start % 3600) // 60)
+                start_s = int(supervision.start % 60)
+                start_ms = int((supervision.start % 1) * 1000)
+
+                end_h = int(supervision.end // 3600)
+                end_m = int((supervision.end % 3600) // 60)
+                end_s = int(supervision.end % 60)
+                end_ms = int((supervision.end % 1) * 1000)
+
+                start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
+                end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
+
+                # Write timestamp line
+                file.write(f"{start_time},{end_time}\n")
+
+                # Write text (with optional speaker)
+                text = supervision.text.strip()
+                if include_speaker_in_text and supervision.speaker:
+                    text = f"{supervision.speaker}: {text}"
+
+                file.write(f"{text}\n")
+
+                # Add blank line between entries (except after last one)
+                if i < len(alignments) - 1:
+                    file.write("\n")
+
     @classmethod
     def _parse_caption(
         cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
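
Conversely, the writer's H:MM:SS.mmm formatting reduces to a small helper; this standalone version mirrors the arithmetic in _write_sbv (hours unpadded, minutes and seconds two digits, milliseconds three):

    def sbv_timestamp(seconds: float) -> str:
        h = int(seconds // 3600)
        m = int((seconds % 3600) // 60)
        s = int(seconds % 60)
        ms = int((seconds % 1) * 1000)
        return f"{h}:{m:02d}:{s:02d}.{ms:03d}"

    print(sbv_timestamp(3723.5))  # 1:02:03.500
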
lattifai/cli/alignment.py CHANGED
@@ -81,7 +81,7 @@ def align(
             caption.word_level=true \\
             caption.normalize_text=true \\
             alignment.device=mps \\
-            alignment.model_name=Lattifai/Lattice-1-Alpha
+            alignment.model_name=LattifAI/Lattice-1-Alpha
     """
     media_config = media or MediaConfig()
 
@@ -142,6 +142,7 @@ def align(
         output_caption_path=caption_config.output_path,
         split_sentence=caption_config.split_sentence,
         channel_selector=media_config.channel_selector,
+        streaming_chunk_secs=media_config.streaming_chunk_secs,
     )
 