lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
- lattifai-1.3.1.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
lattifai/caption/caption.py
DELETED
|
@@ -1,661 +0,0 @@
|
|
|
1
|
-
"""Caption data structure for storing subtitle information with metadata."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import io
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
|
|
9
|
-
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from ..config.caption import KaraokeConfig
|
|
12
|
-
|
|
13
|
-
from lhotse.supervision import AlignmentItem
|
|
14
|
-
from lhotse.utils import Pathlike
|
|
15
|
-
from tgt import TextGrid
|
|
16
|
-
|
|
17
|
-
from ..config.caption import InputCaptionFormat, OutputCaptionFormat # noqa: F401
|
|
18
|
-
from .formats import detect_format, get_reader, get_writer
|
|
19
|
-
from .supervision import Supervision
|
|
20
|
-
|
|
21
|
-
DiarizationOutput = TypeVar("DiarizationOutput")
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@dataclass
class Caption:
    """
    Container for caption/subtitle data with metadata.

    This class encapsulates a list of supervisions (subtitle segments) along with
    metadata such as language, kind, format information, and source file details.

    Segment selection rules used throughout the class:

    * The sequence protocol (``len``, iteration, indexing, truthiness) operates on
      ``supervisions`` and falls back to ``transcription`` when there are none.
    * Export (``write``, ``to_bytes``, ``to_string``, ``with_margins``) prefers
      ``alignments``, then ``supervisions``, then ``transcription``.
    * ``is_empty``, ``duration``, ``start_time`` and ``end_time`` look at
      ``supervisions`` only.

    Attributes:
        supervisions: List of supervision segments containing text and timing information
        transcription: Supervision segments produced by transcription
        audio_events: Audio event detection results
        speaker_diarization: Speaker diarization results
        alignments: Supervision segments produced by alignment
        language: Language code (e.g., 'en', 'zh', 'es')
        kind: Caption kind/type (e.g., 'captions', 'subtitles', 'descriptions')
        source_format: Original format of the caption file (e.g., 'vtt', 'srt', 'json')
        source_path: Path to the source caption file
        metadata: Additional custom metadata as key-value pairs
    """

    # read from subtitle file
    supervisions: List[Supervision] = field(default_factory=list)
    # Transcription results
    transcription: List[Supervision] = field(default_factory=list)
    # Audio Event Detection results
    audio_events: Optional[TextGrid] = None
    # Speaker Diarization results
    speaker_diarization: Optional[DiarizationOutput] = None
    # Alignment results
    alignments: List[Supervision] = field(default_factory=list)

    language: Optional[str] = None
    kind: Optional[str] = None
    source_format: Optional[str] = None
    source_path: Optional[Pathlike] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _primary_segments(self) -> List[Supervision]:
        """Return the segments backing the sequence protocol (supervisions, else transcription)."""
        return self.supervisions or self.transcription

    def _output_segments(self) -> List[Supervision]:
        """Return the segments used for export (alignments, else supervisions, else transcription)."""
        return self.alignments or self.supervisions or self.transcription

    @staticmethod
    def _clip_to_zero(raw_start: float, raw_end: float, duration: float):
        """
        Clip an already-shifted time span at t=0.

        Returns:
            ``(start, duration)`` for the visible part of the span, or ``None``
            when the span ends at or before 0 and must be dropped.
        """
        if raw_end <= 0:
            return None
        if raw_start < 0:
            # Only the part after t=0 survives, so its length equals the shifted end.
            return 0.0, raw_end
        return raw_start, duration

    # ------------------------------------------------------------------
    # Sequence protocol
    # ------------------------------------------------------------------

    def __len__(self) -> int:
        """Return the number of supervision segments (transcription segments if there are none)."""
        return len(self._primary_segments())

    def __iter__(self):
        """Iterate over the same segments that ``len()`` counts."""
        return iter(self._primary_segments())

    def __getitem__(self, index):
        """Get a segment by index (or slice) from the same list that ``len()`` counts."""
        return self._primary_segments()[index]

    def __bool__(self) -> bool:
        """Return True if the caption has at least one segment as counted by ``len()``."""
        return len(self) > 0

    @property
    def is_empty(self) -> bool:
        """Check if caption has no supervisions (transcription is not considered)."""
        return len(self.supervisions) == 0

    # ------------------------------------------------------------------
    # Timing
    # ------------------------------------------------------------------

    @property
    def duration(self) -> Optional[float]:
        """
        Get total duration of the caption in seconds.

        Returns:
            Span from the earliest supervision start to the latest supervision end,
            or None if there are no supervisions
        """
        start = self.start_time
        end = self.end_time
        if start is None or end is None:
            return None
        return end - start

    @property
    def start_time(self) -> Optional[float]:
        """Get the earliest start time among supervisions, or None if there are none."""
        if not self.supervisions:
            return None
        # min() rather than [0]: do not rely on the list being time-sorted.
        return min(sup.start for sup in self.supervisions)

    @property
    def end_time(self) -> Optional[float]:
        """Get the latest end time among supervisions, or None if there are none."""
        if not self.supervisions:
            return None
        # max() rather than [-1]: an earlier, overlapping segment may end last.
        return max(sup.end for sup in self.supervisions)

    # ------------------------------------------------------------------
    # Mutation / derivation
    # ------------------------------------------------------------------

    def append(self, supervision: Supervision) -> None:
        """Add a supervision segment to the caption."""
        self.supervisions.append(supervision)

    def extend(self, supervisions: List[Supervision]) -> None:
        """Add multiple supervision segments to the caption."""
        self.supervisions.extend(supervisions)

    def filter_by_speaker(self, speaker: str) -> "Caption":
        """
        Create a new Caption with only supervisions from a specific speaker.

        Only ``supervisions`` and the descriptive fields are carried over; the
        metadata dict is shallow-copied.

        Args:
            speaker: Speaker identifier to filter by

        Returns:
            New Caption instance with filtered supervisions
        """
        filtered_sups = [sup for sup in self.supervisions if sup.speaker == speaker]
        return Caption(
            supervisions=filtered_sups,
            language=self.language,
            kind=self.kind,
            source_format=self.source_format,
            source_path=self.source_path,
            metadata=self.metadata.copy(),
        )

    def get_speakers(self) -> List[str]:
        """
        Get list of unique speakers in the caption.

        Returns:
            Sorted list of unique speaker identifiers (segments without a speaker are ignored)
        """
        speakers = {sup.speaker for sup in self.supervisions if sup.speaker}
        return sorted(speakers)

    def shift_time(self, seconds: float) -> "Caption":
        """
        Create a new Caption with all timestamps shifted by given seconds.

        Segments (and alignment items) that end at or before 0 after the shift are
        dropped; those that straddle 0 are clipped to start at 0. Every alignment
        tier present on a supervision is shifted, not only ``"word"``.

        Args:
            seconds: Number of seconds to shift (positive delays, negative advances)

        Returns:
            New Caption instance with shifted timestamps
        """
        shifted_sups = []
        for sup in self.supervisions:
            span = self._clip_to_zero(sup.start + seconds, sup.end + seconds, sup.duration)
            if span is None:
                continue
            final_start, final_duration = span

            # Shift every alignment tier so none is left with stale timestamps.
            final_alignment = None
            original_alignment = getattr(sup, "alignment", None)
            if original_alignment:
                final_alignment = {}
                for tier, items in original_alignment.items():
                    shifted_items = []
                    for item in items or ():
                        item_start = item.start + seconds
                        item_span = self._clip_to_zero(item_start, item_start + item.duration, item.duration)
                        if item_span is None:
                            continue
                        shifted_items.append(
                            AlignmentItem(
                                symbol=item.symbol,
                                start=item_span[0],
                                duration=item_span[1],
                                score=item.score,
                            )
                        )
                    final_alignment[tier] = shifted_items

            shifted_sups.append(
                Supervision(
                    text=sup.text,
                    start=final_start,
                    duration=final_duration,
                    speaker=sup.speaker,
                    id=sup.id,
                    recording_id=getattr(sup, "recording_id", ""),
                    channel=getattr(sup, "channel", 0),
                    language=sup.language,
                    alignment=final_alignment,
                    custom=sup.custom,
                )
            )

        return Caption(
            supervisions=shifted_sups,
            language=self.language,
            kind=self.kind,
            source_format=self.source_format,
            source_path=self.source_path,
            metadata=self.metadata.copy(),
        )

    def with_margins(
        self,
        start_margin: float = 0.08,
        end_margin: float = 0.20,
        min_gap: float = 0.08,
        collision_mode: str = "trim",
    ) -> "Caption":
        """
        Create a new Caption with segment boundaries adjusted based on word-level alignment.

        Uses supervision.alignment['word'] to recalculate segment start/end times
        with the specified margins applied around the actual speech boundaries.

        Args:
            start_margin: Seconds to extend before the first word (default: 0.08)
            end_margin: Seconds to extend after the last word (default: 0.20)
            min_gap: Minimum gap between segments for collision handling (default: 0.08)
            collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')

        Returns:
            New Caption instance with adjusted timestamps

        Note:
            Segments without alignment data will keep their original timestamps.

        Example:
            >>> caption = Caption.read("aligned.srt")
            >>> adjusted = caption.with_margins(start_margin=0.05, end_margin=0.15)
            >>> adjusted.write("output.srt")
        """
        from .standardize import apply_margins_to_captions

        adjusted_sups = apply_margins_to_captions(
            self._output_segments(),
            start_margin=start_margin,
            end_margin=end_margin,
            min_gap=min_gap,
            collision_mode=collision_mode,
        )

        return Caption(
            supervisions=adjusted_sups,
            transcription=self.transcription,
            audio_events=self.audio_events,
            speaker_diarization=self.speaker_diarization,
            alignments=[],  # Clear alignments since we've applied them
            language=self.language,
            kind=self.kind,
            source_format=self.source_format,
            source_path=self.source_path,
            metadata=self.metadata.copy(),
        )

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def to_string(
        self,
        format: str = "srt",
        word_level: bool = False,
        karaoke_config: Optional["KaraokeConfig"] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Return caption content in specified format.

        Args:
            format: Output format (e.g., 'srt', 'vtt', 'ass')
            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
            karaoke_config: Karaoke configuration. When provided with enabled=True,
                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.

        Returns:
            String containing formatted captions
        """
        return self.to_bytes(
            output_format=format, word_level=word_level, karaoke_config=karaoke_config, metadata=metadata
        ).decode("utf-8")

    def to_dict(self) -> Dict:
        """
        Convert Caption to dictionary representation.

        Returns:
            Dictionary with caption data and metadata. The ``metadata`` value is a
            shallow copy, so mutating the result does not alter this Caption.
        """
        return {
            "supervisions": [sup.to_dict() for sup in self.supervisions],
            "language": self.language,
            "kind": self.kind,
            "source_format": self.source_format,
            "source_path": str(self.source_path) if self.source_path else None,
            "metadata": dict(self.metadata) if self.metadata else {},
            "duration": self.duration,
            "num_segments": len(self.supervisions),
            "speakers": self.get_speakers(),
        }

    @classmethod
    def from_supervisions(
        cls,
        supervisions: List[Supervision],
        language: Optional[str] = None,
        kind: Optional[str] = None,
        source_format: Optional[str] = None,
        source_path: Optional[Pathlike] = None,
        metadata: Optional[Dict[str, str]] = None,
    ) -> "Caption":
        """
        Create Caption from a list of supervisions.

        Args:
            supervisions: List of supervision segments
            language: Language code
            kind: Caption kind/type
            source_format: Original format
            source_path: Source file path
            metadata: Additional metadata

        Returns:
            New Caption instance
        """
        return cls(
            supervisions=supervisions,
            language=language,
            kind=kind,
            source_format=source_format,
            source_path=source_path,
            metadata=metadata or {},
        )

    @classmethod
    def from_string(
        cls,
        content: str,
        format: str,
        normalize_text: bool = True,
    ) -> "Caption":
        """
        Create Caption from string content.

        Args:
            content: Caption content as string
            format: Caption format (e.g., 'srt', 'vtt', 'ass')
            normalize_text: Whether to normalize text during reading

        Returns:
            New Caption instance

        Example:
            >>> srt_content = \"\"\"1
            ... 00:00:00,000 --> 00:00:02,000
            ... Hello world\"\"\"
            >>> caption = Caption.from_string(srt_content, format=\"srt\")
        """
        buffer = io.StringIO(content)
        return cls.read(buffer, format=format, normalize_text=normalize_text)

    def to_bytes(
        self,
        output_format: Optional[str] = None,
        include_speaker_in_text: bool = True,
        word_level: bool = False,
        karaoke_config: Optional["KaraokeConfig"] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> bytes:
        """
        Convert caption to bytes.

        Args:
            output_format: Output format (e.g., 'srt', 'vtt', 'ass'). Defaults to source_format or 'srt'
            include_speaker_in_text: Whether to include speaker labels in text
            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
            karaoke_config: Karaoke configuration. When provided with enabled=True,
                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.

        Returns:
            Caption content as bytes

        Example:
            >>> caption = Caption.read("input.srt")
            >>> # Get as bytes in original format
            >>> data = caption.to_bytes()
            >>> # Get as bytes in specific format
            >>> vtt_data = caption.to_bytes(output_format="vtt")
        """
        return self.write(
            None,
            output_format=output_format,
            include_speaker_in_text=include_speaker_in_text,
            word_level=word_level,
            karaoke_config=karaoke_config,
            metadata=metadata,
        )

    @classmethod
    def from_transcription_results(
        cls,
        transcription: List[Supervision],
        audio_events: Optional[TextGrid] = None,
        speaker_diarization: Optional[DiarizationOutput] = None,
        language: Optional[str] = None,
        source_path: Optional[Pathlike] = None,
        metadata: Optional[Dict[str, str]] = None,
    ) -> "Caption":
        """
        Create Caption from transcription results including audio events and diarization.

        The result is tagged with ``kind="transcription"`` and ``source_format="asr"``.

        Args:
            transcription: List of transcription supervision segments
            audio_events: Optional TextGrid with audio event detection results
            speaker_diarization: Optional DiarizationOutput with speaker diarization results
            language: Language code
            source_path: Source file path
            metadata: Additional metadata

        Returns:
            New Caption instance with transcription data
        """
        return cls(
            transcription=transcription,
            audio_events=audio_events,
            speaker_diarization=speaker_diarization,
            language=language,
            kind="transcription",
            source_format="asr",
            source_path=source_path,
            metadata=metadata or {},
        )

    @classmethod
    def read(
        cls,
        path: Union[Pathlike, io.BytesIO, io.StringIO],
        format: Optional[str] = None,
        normalize_text: bool = True,
    ) -> "Caption":
        """
        Read caption file or in-memory data and return Caption object.

        Args:
            path: Path to caption file, or BytesIO/StringIO object with caption content
            format: Caption format (auto-detected if not provided, required for in-memory data).
                An explicit value is lower-cased, matching ``write``.
            normalize_text: Whether to normalize text during reading

        Returns:
            Caption object containing supervisions and metadata

        Raises:
            ValueError: If ``path`` is an in-memory buffer and ``format`` is not given
        """
        is_stream = isinstance(path, (io.BytesIO, io.StringIO))

        if format:
            format = format.lower()
        else:
            if is_stream:
                raise ValueError("format parameter is required when reading from BytesIO/StringIO")
            # Detection first, then the file extension, then a last-resort default.
            format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"

        # Get content if it's an in-memory buffer
        source = path
        if isinstance(path, io.BytesIO):
            # utf-8-sig decodes plain UTF-8 unchanged but strips a leading BOM,
            # which would otherwise end up inside the first cue.
            source = path.read().decode("utf-8-sig")
        elif isinstance(path, io.StringIO):
            source = path.read()

        # Leave the caller's buffer rewound so it can be read again
        if is_stream:
            path.seek(0)

        # Get reader and perform extraction
        reader_cls = get_reader(format)
        if not reader_cls:
            # Use pysubs2 as a generic fallback if no specific reader exists
            from .formats.pysubs2 import Pysubs2Format

            reader_cls = Pysubs2Format

        supervisions = reader_cls.read(source, normalize_text=normalize_text)
        metadata = reader_cls.extract_metadata(source)

        # Record source_path only for something that looks like, and is, a real file:
        # a str argument may also be raw caption content.
        source_path = None
        if isinstance(path, (str, Path)) and not ("\n" in str(path) or len(str(path)) > 500):
            try:
                p = Path(str(path))
                if p.exists():
                    source_path = str(p)
            except (OSError, ValueError):
                pass

        return cls(
            supervisions=supervisions,
            language=metadata.get("language"),
            kind=metadata.get("kind"),
            source_format=format,
            source_path=source_path,
            metadata=metadata,
        )

    def write(
        self,
        path: Union[Pathlike, io.BytesIO, None] = None,
        output_format: Optional[str] = None,
        include_speaker_in_text: bool = True,
        word_level: bool = False,
        karaoke_config: Optional["KaraokeConfig"] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Union[Pathlike, bytes]:
        """
        Write caption to file or return as bytes.

        Args:
            path: Path to output caption file, BytesIO object, or None to return bytes
            output_format: Output format (e.g., 'srt', 'vtt', 'ass')
            include_speaker_in_text: Whether to include speaker labels in text
            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
            karaoke_config: Karaoke configuration. When provided with enabled=True,
                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
                Can be used to override or supplement format-specific metadata.

        Returns:
            Path to the written file if path is a file path, or bytes if path is BytesIO/None
        """
        supervisions = self._output_segments()

        # Merge external metadata with self.metadata (external takes precedence)
        effective_metadata = dict(self.metadata) if self.metadata else {}
        if metadata:
            effective_metadata.update(metadata)

        # Determine output format
        if output_format:
            output_format = output_format.lower()
        elif path is None or isinstance(path, io.BytesIO):
            output_format = self.source_format or "srt"
        else:
            output_format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"

        # Professional formats are recognised from the file name and override the format
        ext = output_format
        if isinstance(path, (str, Path)):
            path_str = str(path)
            lowered = path_str.lower()
            if path_str.endswith("_avid.txt"):
                ext = "avid_ds"
            elif "audition" in lowered and path_str.endswith(".csv"):
                ext = "audition_csv"
            elif "edimarker" in lowered and path_str.endswith(".csv"):
                ext = "edimarker_csv"
            elif "imsc" in lowered and path_str.endswith(".ttml"):
                ext = "imsc1"
            elif "ebu" in lowered and path_str.endswith(".ttml"):
                ext = "ebu_tt_d"

        writer_cls = get_writer(ext)
        if not writer_cls:
            # Generic fallback when no dedicated writer is registered
            from .formats.pysubs2 import Pysubs2Format

            writer_cls = Pysubs2Format

        if isinstance(path, (str, Path)):
            return writer_cls.write(
                supervisions,
                path,
                include_speaker=include_speaker_in_text,
                word_level=word_level,
                karaoke_config=karaoke_config,
                metadata=effective_metadata,
            )

        content = writer_cls.to_bytes(
            supervisions,
            include_speaker=include_speaker_in_text,
            word_level=word_level,
            karaoke_config=karaoke_config,
            metadata=effective_metadata,
        )
        if isinstance(path, io.BytesIO):
            path.write(content)
            path.seek(0)
        return content

    def read_speaker_diarization(
        self,
        path: Pathlike,
    ) -> "DiarizationOutput":
        """
        Read speaker diarization TextGrid from file.

        The loaded result is stored on ``self.speaker_diarization`` and returned.
        """
        from lattifai_core.diarization import DiarizationOutput

        self.speaker_diarization = DiarizationOutput.read(path)
        return self.speaker_diarization

    def write_speaker_diarization(
        self,
        path: Pathlike,
    ) -> Pathlike:
        """
        Write speaker diarization TextGrid to file.

        Returns:
            The ``path`` that was written

        Raises:
            ValueError: If there is no speaker diarization data to write
        """
        if not self.speaker_diarization:
            raise ValueError("No speaker diarization data to write.")

        self.speaker_diarization.write(path)
        return path

    def __repr__(self) -> str:
        """String representation of Caption."""
        lang = f"lang={self.language}" if self.language else "lang=unknown"
        parts = [f"Caption({len(self)} segments", lang]
        if self.kind:
            parts.append(f"kind={self.kind}")
        duration = self.duration
        # "is not None" so that a genuine zero-length caption still reports its duration.
        if duration is not None:
            parts.append(f"duration={duration:.2f}s")
        return ", ".join(parts) + ")"
|