lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai/caption/caption.py
CHANGED
@@ -1,19 +1,22 @@
 """Caption data structure for storing subtitle information with metadata."""
 
-import
-
+from __future__ import annotations
+
+import io
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, List, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
+
+if TYPE_CHECKING:
+    from ..config.caption import KaraokeConfig
 
 from lhotse.supervision import AlignmentItem
 from lhotse.utils import Pathlike
 from tgt import TextGrid
 
 from ..config.caption import InputCaptionFormat, OutputCaptionFormat  # noqa: F401
+from .formats import detect_format, get_reader, get_writer
 from .supervision import Supervision
-from .text_parser import normalize_text as normalize_text_fn
-from .text_parser import parse_speaker_text, parse_timestamp_text
 
 DiarizationOutput = TypeVar("DiarizationOutput")
 
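The new header pairs `from __future__ import annotations` with a `typing.TYPE_CHECKING` guard, so `KaraokeConfig` is imported only while type checking and never at runtime. A minimal sketch of this standard pattern (the module and class names below are illustrative, not taken from the package):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers; avoids a runtime (possibly circular) import.
    from mypackage.config import HeavyConfig  # hypothetical module


def describe(config: HeavyConfig) -> str:
    # With deferred annotation evaluation the annotation stays a string at
    # runtime, so HeavyConfig never needs to be importable to call this.
    return f"config: {config!r}"
```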
@@ -50,7 +53,7 @@ class Caption:
     kind: Optional[str] = None
     source_format: Optional[str] = None
     source_path: Optional[Pathlike] = None
-    metadata: Dict[str,
+    metadata: Dict[str, Any] = field(default_factory=dict)
 
     def __len__(self) -> int:
         """Return the number of supervision segments."""
@@ -66,7 +69,7 @@ class Caption:
 
     def __bool__(self) -> bool:
         """Return True if caption has supervisions."""
-        return self
+        return len(self) > 0
 
     @property
     def is_empty(self) -> bool:
@@ -147,19 +150,72 @@ class Caption:
         Returns:
             New Caption instance with shifted timestamps
         """
-        shifted_sups = [
-
-
-
-
-
-
-
-
+        shifted_sups = []
+        for sup in self.supervisions:
+            # Calculate physical time range
+            raw_start = sup.start + seconds
+            raw_end = sup.end + seconds
+
+            # Skip segments that end before 0
+            if raw_end <= 0:
+                continue
+
+            # Clip start to 0 if negative
+            if raw_start < 0:
+                final_start = 0.0
+                final_duration = raw_end
+            else:
+                final_start = raw_start
+                final_duration = sup.duration
+
+            # Handle alignment (word-level timestamps)
+            final_alignment = None
+            original_alignment = getattr(sup, "alignment", None)
+            if original_alignment and "word" in original_alignment:
+                new_words = []
+                for word in original_alignment["word"]:
+                    w_start = word.start + seconds
+                    w_end = w_start + word.duration
+
+                    # Skip words that end before 0
+                    if w_end <= 0:
+                        continue
+
+                    # Clip start to 0 if negative
+                    if w_start < 0:
+                        w_final_start = 0.0
+                        w_final_duration = w_end
+                    else:
+                        w_final_start = w_start
+                        w_final_duration = word.duration
+
+                    new_words.append(
+                        AlignmentItem(
+                            symbol=word.symbol,
+                            start=w_final_start,
+                            duration=w_final_duration,
+                            score=word.score,
+                        )
+                    )
+
+                # Copy original alignment dict structure and update words
+                final_alignment = original_alignment.copy()
+                final_alignment["word"] = new_words
+
+            shifted_sups.append(
+                Supervision(
+                    text=sup.text,
+                    start=final_start,
+                    duration=final_duration,
+                    speaker=sup.speaker,
+                    id=sup.id,
+                    recording_id=sup.recording_id if hasattr(sup, "recording_id") else "",
+                    channel=getattr(sup, "channel", 0),
+                    language=sup.language,
+                    alignment=final_alignment,
+                    custom=sup.custom,
+                )
             )
-            for sup in self.supervisions
-        ]
 
         return Caption(
             supervisions=shifted_sups,
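The rewritten shift logic above drops any segment that would end before t=0 and clips a segment that straddles zero so it starts at 0.0, then applies the identical rule to each word-level AlignmentItem. A standalone sketch of that clipping rule, using plain tuples instead of the package's Supervision/AlignmentItem types:

```python
from typing import List, Optional, Tuple

Segment = Tuple[float, float]  # (start, duration) in seconds


def shift_segment(seg: Segment, seconds: float) -> Optional[Segment]:
    """Shift by `seconds`; drop if it ends before 0, clip if it straddles 0."""
    start, duration = seg
    raw_start = start + seconds
    raw_end = start + duration + seconds
    if raw_end <= 0:
        return None  # ends before 0: skipped entirely
    if raw_start < 0:
        return (0.0, raw_end)  # clipped: only the part after 0 survives
    return (raw_start, duration)


segments: List[Segment] = [(0.5, 1.0), (2.0, 1.0)]
shifted = [s for s in (shift_segment(seg, -1.0) for seg in segments) if s]
print(shifted)  # [(0.0, 0.5), (1.0, 1.0)]
```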
@@ -170,52 +226,90 @@ class Caption:
             metadata=self.metadata.copy(),
         )
 
-    def
+    def with_margins(
+        self,
+        start_margin: float = 0.08,
+        end_margin: float = 0.20,
+        min_gap: float = 0.08,
+        collision_mode: str = "trim",
+    ) -> "Caption":
         """
-
+        Create a new Caption with segment boundaries adjusted based on word-level alignment.
+
+        Uses supervision.alignment['word'] to recalculate segment start/end times
+        with the specified margins applied around the actual speech boundaries.
 
         Args:
-
+            start_margin: Seconds to extend before the first word (default: 0.08)
+            end_margin: Seconds to extend after the last word (default: 0.20)
+            min_gap: Minimum gap between segments for collision handling (default: 0.08)
+            collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')
 
         Returns:
-
-
-
+            New Caption instance with adjusted timestamps
+
+        Note:
+            Segments without alignment data will keep their original timestamps.
 
-
+        Example:
+            >>> caption = Caption.read("aligned.srt")
+            >>> adjusted = caption.with_margins(start_margin=0.05, end_margin=0.15)
+            >>> adjusted.write("output.srt")
+        """
+        from .standardize import apply_margins_to_captions
 
+        # Determine which supervisions to use
         if self.alignments:
-
+            source_sups = self.alignments
+        elif self.supervisions:
+            source_sups = self.supervisions
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            source_sups = self.transcription
+
+        adjusted_sups = apply_margins_to_captions(
+            source_sups,
+            start_margin=start_margin,
+            end_margin=end_margin,
+            min_gap=min_gap,
+            collision_mode=collision_mode,
+        )
+
+        return Caption(
+            supervisions=adjusted_sups,
+            transcription=self.transcription,
+            audio_events=self.audio_events,
+            speaker_diarization=self.speaker_diarization,
+            alignments=[],  # Clear alignments since we've applied them
+            language=self.language,
+            kind=self.kind,
+            source_format=self.source_format,
+            source_path=self.source_path,
+            metadata=self.metadata.copy(),
+        )
+
+    def to_string(
+        self,
+        format: str = "srt",
+        word_level: bool = False,
+        karaoke_config: Optional["KaraokeConfig"] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        """
+        Return caption content in specified format.
 
-
+        Args:
+            format: Output format (e.g., 'srt', 'vtt', 'ass')
+            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+            karaoke_config: Karaoke configuration. When provided with enabled=True,
+                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+        Returns:
+            String containing formatted captions
+        """
+        return self.to_bytes(
+            output_format=format, word_level=word_level, karaoke_config=karaoke_config, metadata=metadata
+        ).decode("utf-8")
 
     def to_dict(self) -> Dict:
         """
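Together, `with_margins` and the new `to_string` allow boundary adjustment and serialization without touching disk. A usage sketch based only on the signatures and docstrings above, assuming `Caption` is re-exported from `lattifai.caption`:

```python
from lattifai.caption import Caption  # assumed re-export location

caption = Caption.read("aligned.srt")
adjusted = caption.with_margins(
    start_margin=0.05,      # seconds of lead-in before the first word
    end_margin=0.15,        # seconds of tail after the last word
    min_gap=0.08,           # minimum silence preserved between segments
    collision_mode="trim",  # trim overlapping neighbours instead of gapping
)
print(adjusted.to_string(format="vtt"))
```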
@@ -269,6 +363,71 @@ class Caption:
             metadata=metadata or {},
         )
 
+    @classmethod
+    def from_string(
+        cls,
+        content: str,
+        format: str,
+        normalize_text: bool = True,
+    ) -> "Caption":
+        """
+        Create Caption from string content.
+
+        Args:
+            content: Caption content as string
+            format: Caption format (e.g., 'srt', 'vtt', 'ass')
+            normalize_text: Whether to normalize text during reading
+
+        Returns:
+            New Caption instance
+
+        Example:
+            >>> srt_content = \"\"\"1
+            ... 00:00:00,000 --> 00:00:02,000
+            ... Hello world\"\"\"
+            >>> caption = Caption.from_string(srt_content, format=\"srt\")
+        """
+        buffer = io.StringIO(content)
+        return cls.read(buffer, format=format, normalize_text=normalize_text)
+
+    def to_bytes(
+        self,
+        output_format: Optional[str] = None,
+        include_speaker_in_text: bool = True,
+        word_level: bool = False,
+        karaoke_config: Optional["KaraokeConfig"] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> bytes:
+        """
+        Convert caption to bytes.
+
+        Args:
+            output_format: Output format (e.g., 'srt', 'vtt', 'ass'). Defaults to source_format or 'srt'
+            include_speaker_in_text: Whether to include speaker labels in text
+            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+            karaoke_config: Karaoke configuration. When provided with enabled=True,
+                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+        Returns:
+            Caption content as bytes
+
+        Example:
+            >>> caption = Caption.read("input.srt")
+            >>> # Get as bytes in original format
+            >>> data = caption.to_bytes()
+            >>> # Get as bytes in specific format
+            >>> vtt_data = caption.to_bytes(output_format="vtt")
+        """
+        return self.write(
+            None,
+            output_format=output_format,
+            include_speaker_in_text=include_speaker_in_text,
+            word_level=word_level,
+            karaoke_config=karaoke_config,
+            metadata=metadata,
+        )
+
     @classmethod
     def from_transcription_results(
         cls,
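The `from_string`/`to_bytes` pair added here completes an in-memory round trip: parse caption text from a string, then serialize to any supported format as bytes. A sketch grounded in the docstring examples above (again assuming the `lattifai.caption` re-export):

```python
from lattifai.caption import Caption  # assumed re-export location

srt = """1
00:00:00,000 --> 00:00:02,000
Hello world
"""

caption = Caption.from_string(srt, format="srt")
vtt_bytes = caption.to_bytes(output_format="vtt")  # no files involved
print(vtt_bytes.decode("utf-8"))
```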
@@ -307,82 +466,168 @@ class Caption:
     @classmethod
     def read(
         cls,
-        path: Pathlike,
+        path: Union[Pathlike, io.BytesIO, io.StringIO],
         format: Optional[str] = None,
         normalize_text: bool = True,
     ) -> "Caption":
         """
-        Read caption file and return Caption object.
+        Read caption file or in-memory data and return Caption object.
 
         Args:
-            path: Path to caption file
-            format: Caption format (auto-detected if not provided)
+            path: Path to caption file, or BytesIO/StringIO object with caption content
+            format: Caption format (auto-detected if not provided, required for in-memory data)
             normalize_text: Whether to normalize text during reading
 
         Returns:
            Caption object containing supervisions and metadata
-
-        Example:
-            >>> caption = Caption.read("subtitles.srt")
-            >>> print(f"Loaded {len(caption)} segments")
         """
-        caption_path = Path(str(path)) if not isinstance(path, Path) else path
-
         # Detect format if not provided
-        if not format
-
-
-            format =
+        if not format:
+            if isinstance(path, (io.BytesIO, io.StringIO)):
+                raise ValueError("format parameter is required when reading from BytesIO/StringIO")
+            format = detect_format(str(path))
+
+        if not format:
+            # Fallback to extension
+            if not isinstance(path, (io.BytesIO, io.StringIO)):
+                format = Path(str(path)).suffix.lstrip(".").lower()
+
+            if not format:
+                format = "srt"  # Last resort default
 
-        #
-
+        # Get content if it's an in-memory buffer
+        source = path
+        if isinstance(path, io.BytesIO):
+            source = path.read().decode("utf-8")
+        elif isinstance(path, io.StringIO):
+            source = path.read()
 
-        #
-
+        # Reset buffer position if it was a stream
+        if isinstance(path, (io.BytesIO, io.StringIO)):
+            path.seek(0)
+
+        # Get reader and perform extraction
+        reader_cls = get_reader(format)
+        if not reader_cls:
+            # Use pysubs2 as a generic fallback if no specific reader exists
+            from .formats.pysubs2 import Pysubs2Format
+
+            reader_cls = Pysubs2Format
+
+        supervisions = reader_cls.read(source, normalize_text=normalize_text)
+        metadata = reader_cls.extract_metadata(source)
 
         # Create Caption object
+        source_path = None
+        if isinstance(path, (str, Path)) and not ("\n" in str(path) or len(str(path)) > 500):
+            try:
+                p = Path(str(path))
+                if p.exists():
+                    source_path = str(p)
+            except (OSError, ValueError):
+                pass
+
         return cls(
             supervisions=supervisions,
             language=metadata.get("language"),
             kind=metadata.get("kind"),
             source_format=format,
-            source_path=
+            source_path=source_path,
             metadata=metadata,
         )
 
     def write(
         self,
-        path: Pathlike,
+        path: Union[Pathlike, io.BytesIO, None] = None,
+        output_format: Optional[str] = None,
         include_speaker_in_text: bool = True,
-
+        word_level: bool = False,
+        karaoke_config: Optional["KaraokeConfig"] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> Union[Pathlike, bytes]:
         """
-        Write caption to file.
+        Write caption to file or return as bytes.
 
         Args:
-            path: Path to output caption file
+            path: Path to output caption file, BytesIO object, or None to return bytes
+            output_format: Output format (e.g., 'srt', 'vtt', 'ass')
             include_speaker_in_text: Whether to include speaker labels in text
+            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+            karaoke_config: Karaoke configuration. When provided with enabled=True,
+                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+                Can be used to override or supplement format-specific metadata.
 
         Returns:
-            Path to the written file
-
-        Example:
-            >>> caption = Caption.read("input.srt")
-            >>> caption.write("output.vtt", include_speaker_in_text=False)
+            Path to the written file if path is a file path, or bytes if path is BytesIO/None
         """
         if self.alignments:
-
+            supervisions = self.alignments
+        elif self.supervisions:
+            supervisions = self.supervisions
         else:
-
-
-
-
+            supervisions = self.transcription
+
+        # Merge external metadata with self.metadata (external takes precedence)
+        effective_metadata = dict(self.metadata) if self.metadata else {}
+        if metadata:
+            effective_metadata.update(metadata)
+
+        # Determine output format
+        if output_format:
+            output_format = output_format.lower()
+        elif isinstance(path, (io.BytesIO, type(None))):
+            output_format = self.source_format or "srt"
+        else:
+            output_format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"
+
+        # Special casing for professional formats as before
+        ext = output_format
+        if isinstance(path, (str, Path)):
+            path_str = str(path)
+            if path_str.endswith("_avid.txt"):
+                ext = "avid_ds"
+            elif "audition" in path_str.lower() and path_str.endswith(".csv"):
+                ext = "audition_csv"
+            elif "edimarker" in path_str.lower() and path_str.endswith(".csv"):
+                ext = "edimarker_csv"
+            elif "imsc" in path_str.lower() and path_str.endswith(".ttml"):
+                ext = "imsc1"
+            elif "ebu" in path_str.lower() and path_str.endswith(".ttml"):
+                ext = "ebu_tt_d"
+
+        writer_cls = get_writer(ext)
+        if not writer_cls:
+            from .formats.pysubs2 import Pysubs2Format
+
+            writer_cls = Pysubs2Format
+
+        if isinstance(path, (str, Path)):
+            return writer_cls.write(
+                supervisions,
+                path,
+                include_speaker=include_speaker_in_text,
+                word_level=word_level,
+                karaoke_config=karaoke_config,
+                metadata=effective_metadata,
+            )
 
-
+        content = writer_cls.to_bytes(
+            supervisions,
+            include_speaker=include_speaker_in_text,
+            word_level=word_level,
+            karaoke_config=karaoke_config,
+            metadata=effective_metadata,
+        )
+        if isinstance(path, io.BytesIO):
+            path.write(content)
+            path.seek(0)
+        return content
 
     def read_speaker_diarization(
         self,
         path: Pathlike,
-    ) ->
+    ) -> "DiarizationOutput":
         """
         Read speaker diarization TextGrid from file.
         """
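With the reworked `read` and `write` signatures, both ends of the pipeline accept in-memory buffers; note that `read` now raises `ValueError` when given a BytesIO/StringIO without an explicit `format`. A sketch under the same re-export assumption:

```python
import io

from lattifai.caption import Caption  # assumed re-export location

with open("input.srt", "rb") as f:
    data = f.read()

# In-memory input: format is mandatory for buffers.
caption = Caption.read(io.BytesIO(data), format="srt")

# write() accepts a BytesIO target (the bytes are also returned),
# or None to just get the bytes back.
buf = io.BytesIO()
caption.write(buf, output_format="vtt")
print(buf.getvalue().decode("utf-8")[:40])
```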
@@ -404,1059 +649,6 @@ class Caption:
         self.speaker_diarization.write(path)
         return path
 
-    @staticmethod
-    def _parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
-        """
-        Extract word-level alignment items from Supervision object.
-
-        Args:
-            supervision: Supervision object with potential alignment data
-
-        Returns:
-            List of AlignmentItem objects, or None if no alignment data present
-        """
-        if not hasattr(supervision, "alignment") or not supervision.alignment:
-            return None
-
-        if "word" not in supervision.alignment:
-            return None
-
-        return supervision.alignment["word"]
-
-    @classmethod
-    def _write_caption(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> Pathlike:
-        """
-        Write caption to file in various formats.
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output file
-            include_speaker_in_text: Whether to include speaker in text
-
-        Returns:
-            Path to written file
-        """
-        if str(output_path)[-4:].lower() == ".txt":
-            with open(output_path, "w", encoding="utf-8") as f:
-                for sup in alignments:
-                    word_items = cls._parse_alignment_from_supervision(sup)
-                    if word_items:
-                        for item in word_items:
-                            f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
-                    else:
-                        if include_speaker_in_text and sup.speaker is not None:
-                            # Use [SPEAKER]: format for consistency with parsing
-                            if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                                text = f"[{sup.speaker}]: {sup.text}"
-                            else:
-                                text = f"{sup.text}"
-                        else:
-                            text = sup.text
-                        f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
-
-        elif str(output_path)[-5:].lower() == ".json":
-            with open(output_path, "w", encoding="utf-8") as f:
-                # Enhanced JSON export with word-level alignment
-                json_data = []
-                for sup in alignments:
-                    sup_dict = sup.to_dict()
-                    json_data.append(sup_dict)
-                json.dump(json_data, f, ensure_ascii=False, indent=4)
-
-        elif str(output_path).lower().endswith(".textgrid"):
-            from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-            tg = TextGrid()
-            supervisions, words, scores = [], [], {"utterances": [], "words": []}
-            for supervision in sorted(alignments, key=lambda x: x.start):
-                # Respect `original_speaker` custom flag: default to include speaker when missing
-                if (
-                    include_speaker_in_text
-                    and supervision.speaker is not None
-                    and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                ):
-                    text = f"{supervision.speaker} {supervision.text}"
-                else:
-                    text = supervision.text
-                supervisions.append(Interval(supervision.start, supervision.end, text or ""))
-                # Extract word-level alignment using helper function
-                word_items = cls._parse_alignment_from_supervision(supervision)
-                if word_items:
-                    for item in word_items:
-                        words.append(Interval(item.start, item.end, item.symbol))
-                        if item.score is not None:
-                            scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
-                if supervision.has_custom("score"):
-                    scores["utterances"].append(
-                        Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
-                    )
-
-            tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
-            if words:
-                tg.add_tier(IntervalTier(name="words", objects=words))
-
-            if scores["utterances"]:
-                tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
-            if scores["words"]:
-                tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
-
-            write_to_file(tg, output_path, format="long")
-
-        elif str(output_path)[-4:].lower() == ".tsv":
-            cls._write_tsv(alignments, output_path, include_speaker_in_text)
-        elif str(output_path)[-4:].lower() == ".csv":
-            cls._write_csv(alignments, output_path, include_speaker_in_text)
-        elif str(output_path)[-4:].lower() == ".aud":
-            cls._write_aud(alignments, output_path, include_speaker_in_text)
-        elif str(output_path)[-4:].lower() == ".sbv":
-            cls._write_sbv(alignments, output_path, include_speaker_in_text)
-        else:
-            import pysubs2
-
-            subs = pysubs2.SSAFile()
-            for sup in alignments:
-                # Add word-level timing as metadata in the caption text
-                word_items = cls._parse_alignment_from_supervision(sup)
-                if word_items:
-                    for word in word_items:
-                        subs.append(
-                            pysubs2.SSAEvent(
-                                start=int(word.start * 1000),
-                                end=int(word.end * 1000),
-                                text=word.symbol,
-                                name=sup.speaker or "",
-                            )
-                        )
-                else:
-                    if include_speaker_in_text and sup.speaker is not None:
-                        if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                            text = f"{sup.speaker} {sup.text}"
-                        else:
-                            text = f"{sup.text}"
-                    else:
-                        text = sup.text
-                    subs.append(
-                        pysubs2.SSAEvent(
-                            start=int(sup.start * 1000),
-                            end=int(sup.end * 1000),
-                            text=text or "",
-                            name=sup.speaker or "",
-                        )
-                    )
-
-            # MicroDVD format requires framerate to be specified
-            output_ext = str(output_path).lower().split(".")[-1]
-            if output_ext == "sub":
-                # Default to 25 fps for MicroDVD format if not specified
-                subs.save(output_path, fps=25.0)
-            else:
-                subs.save(output_path)
-
-        return output_path
-
-    @classmethod
-    def _extract_metadata(cls, caption: Pathlike, format: Optional[str]) -> Dict[str, str]:
-        """
-        Extract metadata from caption file header.
-
-        Args:
-            caption: Caption file path or content
-            format: Caption format
-
-        Returns:
-            Dictionary of metadata key-value pairs
-        """
-        metadata = {}
-        caption_path = Path(str(caption))
-
-        if not caption_path.exists():
-            return metadata
-
-        try:
-            with open(caption_path, "r", encoding="utf-8") as f:
-                content = f.read(2048)  # Read first 2KB for metadata
-
-            # WebVTT metadata extraction
-            if format == "vtt" or content.startswith("WEBVTT"):
-                lines = content.split("\n")
-                for line in lines[:10]:  # Check first 10 lines
-                    line = line.strip()
-                    if line.startswith("Kind:"):
-                        metadata["kind"] = line.split(":", 1)[1].strip()
-                    elif line.startswith("Language:"):
-                        metadata["language"] = line.split(":", 1)[1].strip()
-                    elif line.startswith("NOTE"):
-                        # Extract metadata from NOTE comments
-                        match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
-                        if match:
-                            key, value = match.groups()
-                            metadata[key.lower()] = value.strip()
-
-            # SRT doesn't have standard metadata, but check for BOM
-            elif format == "srt":
-                if content.startswith("\ufeff"):
-                    metadata["encoding"] = "utf-8-sig"
-
-            # TextGrid metadata
-            elif format == "textgrid" or caption_path.suffix.lower() == ".textgrid":
-                match = re.search(r"xmin\s*=\s*([\d.]+)", content)
-                if match:
-                    metadata["xmin"] = match.group(1)
-                match = re.search(r"xmax\s*=\s*([\d.]+)", content)
-                if match:
-                    metadata["xmax"] = match.group(1)
-
-        except Exception:
-            # If metadata extraction fails, continue with empty metadata
-            pass
-
-        return metadata
-
-    @classmethod
-    def _parse_youtube_vtt_with_word_timestamps(
-        cls, content: str, normalize_text: Optional[bool] = False
-    ) -> List[Supervision]:
-        """
-        Parse YouTube VTT format with word-level timestamps.
-
-        YouTube auto-generated captions use this format:
-        Word1<00:00:10.559><c> Word2</c><00:00:11.120><c> Word3</c>...
-
-        Args:
-            content: VTT file content
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects with word-level alignments
-        """
-        from lhotse.supervision import AlignmentItem
-
-        supervisions = []
-
-        # Pattern to match timestamp lines: 00:00:14.280 --> 00:00:17.269 align:start position:0%
-        timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")
-
-        # Pattern to match word-level timestamps: <00:00:10.559><c> word</c>
-        word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")
-
-        # Pattern to match the first word (before first timestamp)
-        first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")
-
-        def parse_timestamp(ts: str) -> float:
-            """Convert timestamp string to seconds."""
-            ts = ts.replace(",", ".")
-            parts = ts.split(":")
-            hours = int(parts[0])
-            minutes = int(parts[1])
-            seconds = float(parts[2])
-            return hours * 3600 + minutes * 60 + seconds
-
-        lines = content.split("\n")
-        i = 0
-        while i < len(lines):
-            line = lines[i].strip()
-
-            # Look for timestamp line
-            ts_match = timestamp_pattern.search(line)
-            if ts_match:
-                cue_start = parse_timestamp(ts_match.group(1))
-                cue_end = parse_timestamp(ts_match.group(2))
-
-                # Read the next non-empty lines for cue content
-                cue_lines = []
-                i += 1
-                while i < len(lines) and lines[i].strip() and not timestamp_pattern.search(lines[i]):
-                    cue_lines.append(lines[i])
-                    i += 1
-
-                # Process cue content
-                for cue_line in cue_lines:
-                    cue_line = cue_line.strip()
-                    if not cue_line:
-                        continue
-
-                    # Check if this line has word-level timestamps
-                    word_matches = word_timestamp_pattern.findall(cue_line)
-                    if word_matches:
-                        # This line has word-level timing
-                        word_alignments = []
-
-                        # Get the first word (before the first timestamp)
-                        first_match = first_word_pattern.match(cue_line)
-                        if first_match:
-                            first_word = first_match.group(1).strip()
-                            first_word_next_ts = parse_timestamp(first_match.group(2))
-                            if first_word:
-                                # First word starts at cue_start
-                                word_alignments.append(
-                                    AlignmentItem(
-                                        symbol=first_word,
-                                        start=cue_start,
-                                        duration=first_word_next_ts - cue_start,
-                                    )
-                                )
-
-                        # Process remaining words with timestamps
-                        for idx, (ts, word) in enumerate(word_matches):
-                            word_start = parse_timestamp(ts)
-                            word = word.strip()
-                            if not word:
-                                continue
-
-                            # Calculate duration based on next word's timestamp or cue end
-                            if idx + 1 < len(word_matches):
-                                next_ts = parse_timestamp(word_matches[idx + 1][0])
-                                duration = next_ts - word_start
-                            else:
-                                duration = cue_end - word_start
-
-                            word_alignments.append(
-                                AlignmentItem(
-                                    symbol=word,
-                                    start=word_start,
-                                    duration=max(0.01, duration),  # Ensure positive duration
-                                )
-                            )
-
-                        if word_alignments:
-                            # Create supervision with word-level alignment
-                            full_text = " ".join(item.symbol for item in word_alignments)
-                            if normalize_text:
-                                full_text = normalize_text_fn(full_text)
-
-                            sup_start = word_alignments[0].start
-                            sup_end = word_alignments[-1].start + word_alignments[-1].duration
-
-                            supervisions.append(
-                                Supervision(
-                                    text=full_text,
-                                    start=sup_start,
-                                    duration=sup_end - sup_start,
-                                    alignment={"word": word_alignments},
-                                )
-                            )
-                    else:
-                        # Plain text line without word-level timing - skip duplicate lines
-                        # (YouTube VTT often repeats the previous line without timestamps)
-                        pass
-
-                continue
-            i += 1
-
-        # Merge consecutive supervisions to form complete utterances
-        if supervisions:
-            supervisions = cls._merge_youtube_vtt_supervisions(supervisions)
-
-        return supervisions
-
-    @classmethod
-    def _merge_youtube_vtt_supervisions(cls, supervisions: List[Supervision]) -> List[Supervision]:
-        """
-        Merge consecutive YouTube VTT supervisions into complete utterances.
-
-        YouTube VTT splits utterances across multiple cues. This method merges
-        cues that are close together in time.
-
-        Args:
-            supervisions: List of supervisions to merge
-
-        Returns:
-            List of merged supervisions
-        """
-        if not supervisions:
-            return supervisions
-
-        merged = []
-        current = supervisions[0]
-
-        for next_sup in supervisions[1:]:
-            # Check if next supervision is close enough to merge (within 0.5 seconds)
-            gap = next_sup.start - (current.start + current.duration)
-
-            if gap < 0.5 and current.alignment and next_sup.alignment:
-                # Merge alignments
-                current_words = current.alignment.get("word", [])
-                next_words = next_sup.alignment.get("word", [])
-                merged_words = list(current_words) + list(next_words)
-
-                # Create merged supervision
-                merged_text = current.text + " " + next_sup.text
-                merged_end = next_sup.start + next_sup.duration
-
-                current = Supervision(
-                    text=merged_text,
-                    start=current.start,
-                    duration=merged_end - current.start,
-                    alignment={"word": merged_words},
-                )
-            else:
-                merged.append(current)
-                current = next_sup
-
-        merged.append(current)
-        return merged
-
-    @classmethod
-    def _is_youtube_vtt_with_word_timestamps(cls, content: str) -> bool:
-        """
-        Check if content is YouTube VTT format with word-level timestamps.
-
-        Args:
-            content: File content to check
-
-        Returns:
-            True if content contains YouTube-style word timestamps
-        """
-        # Look for pattern like <00:00:10.559><c> word</c>
-        return bool(re.search(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>", content))
-
-    @classmethod
-    def _parse_supervisions(
-        cls, caption: Pathlike, format: Optional[str], normalize_text: Optional[bool] = False
-    ) -> List[Supervision]:
-        """
-        Parse supervisions from caption file.
-
-        Args:
-            caption: Caption file path or content
-            format: Caption format
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        if format:
-            format = format.lower()
-
-        # Check for YouTube VTT with word-level timestamps first
-        caption_path = Path(str(caption))
-        if caption_path.exists():
-            with open(caption_path, "r", encoding="utf-8") as f:
-                content = f.read()
-            if cls._is_youtube_vtt_with_word_timestamps(content):
-                return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
-
-        # Match Gemini format: explicit format, or files ending with Gemini.md/Gemini3.md,
-        # or files containing "gemini" in the name with .md extension
-        caption_str = str(caption).lower()
-        is_gemini_format = (
-            format == "gemini"
-            or str(caption).endswith("Gemini.md")
-            or str(caption).endswith("Gemini3.md")
-            or ("gemini" in caption_str and caption_str.endswith(".md"))
-        )
-        if is_gemini_format:
-            from .gemini_reader import GeminiReader
-
-            supervisions = GeminiReader.extract_for_alignment(caption)
-        elif format and (format == "textgrid" or str(caption).lower().endswith("textgrid")):
-            # Internel usage
-            from tgt import read_textgrid
-
-            tgt = read_textgrid(caption)
-            supervisions = []
-            for tier in tgt.tiers:
-                supervisions.extend(
-                    [
-                        Supervision(
-                            text=interval.text,
-                            start=interval.start_time,
-                            duration=interval.end_time - interval.start_time,
-                            speaker=tier.name,
-                        )
-                        for interval in tier.intervals
-                    ]
-                )
-            supervisions = sorted(supervisions, key=lambda x: x.start)
-        elif format == "tsv" or str(caption)[-4:].lower() == ".tsv":
-            supervisions = cls._parse_tsv(caption, normalize_text)
-        elif format == "csv" or str(caption)[-4:].lower() == ".csv":
-            supervisions = cls._parse_csv(caption, normalize_text)
-        elif format == "aud" or str(caption)[-4:].lower() == ".aud":
-            supervisions = cls._parse_aud(caption, normalize_text)
-        elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
-            supervisions = cls._parse_sbv(caption, normalize_text)
-        elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
-            if not Path(str(caption)).exists():  # str
-                lines = [line.strip() for line in str(caption).split("\n")]
-            else:  # file
-                path_str = str(caption)
-                with open(path_str, encoding="utf-8") as f:
-                    lines = [line.strip() for line in f.readlines()]
-            if normalize_text:
-                lines = [normalize_text_fn(line) for line in lines]
-            supervisions = []
-            for line in lines:
-                if line:
-                    # First try to parse timestamp format: [start-end] text
-                    start, end, remaining_text = parse_timestamp_text(line)
-                    if start is not None and end is not None:
-                        # Has timestamp, now check for speaker in the remaining text
-                        speaker, text = parse_speaker_text(remaining_text)
-                        supervisions.append(
-                            Supervision(
-                                text=text,
-                                start=start,
-                                duration=end - start,
-                                speaker=speaker,
-                            )
-                        )
-                    else:
-                        # No timestamp, just parse speaker and text
-                        speaker, text = parse_speaker_text(line)
-                        supervisions.append(Supervision(text=text, speaker=speaker))
-        else:
-            try:
-                supervisions = cls._parse_caption(caption, format=format, normalize_text=normalize_text)
-            except Exception as e:
-                print(f"Failed to parse caption with Format: {format}, Exception: {e}, trying 'gemini' parser.")
-                from .gemini_reader import GeminiReader
-
-                supervisions = GeminiReader.extract_for_alignment(caption)
-
-        return supervisions
-
-    @classmethod
-    def _parse_tsv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse TSV (Tab-Separated Values) format caption file.
-
-        Format specifications:
-        - With speaker: speaker\tstart\tend\ttext
-        - Without speaker: start\tend\ttext
-        - Times are in milliseconds
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-
-        # Check if first line is a header
-        first_line = lines[0].strip().lower()
-        has_header = "start" in first_line and "end" in first_line and "text" in first_line
-        has_speaker_column = "speaker" in first_line
-
-        start_idx = 1 if has_header else 0
-
-        for line in lines[start_idx:]:
-            line = line.strip()
-            if not line:
-                continue
-
-            parts = line.split("\t")
-            if len(parts) < 3:
-                continue
-
-            try:
-                if has_speaker_column and len(parts) >= 4:
-                    # Format: speaker\tstart\tend\ttext
-                    speaker = parts[0].strip() if parts[0].strip() else None
-                    start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[2]) / 1000.0
-                    text = "\t".join(parts[3:]).strip()
-                else:
-                    # Format: start\tend\ttext
-                    start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[1]) / 1000.0
-                    text = "\t".join(parts[2:]).strip()
-                    speaker = None
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed lines
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _parse_csv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse CSV (Comma-Separated Values) format caption file.
-
-        Format specifications:
-        - With speaker: speaker,start,end,text
-        - Without speaker: start,end,text
-        - Times are in milliseconds
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        import csv
-
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8", newline="") as f:
-            reader = csv.reader(f)
-            lines = list(reader)
-
-        if not lines:
-            return supervisions
-
-        # Check if first line is a header
-        first_line = [col.strip().lower() for col in lines[0]]
-        has_header = "start" in first_line and "end" in first_line and "text" in first_line
-        has_speaker_column = "speaker" in first_line
-
-        start_idx = 1 if has_header else 0
-
-        for parts in lines[start_idx:]:
-            if len(parts) < 3:
-                continue
-
-            try:
-                if has_speaker_column and len(parts) >= 4:
-                    # Format: speaker,start,end,text
-                    speaker = parts[0].strip() if parts[0].strip() else None
-                    start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[2]) / 1000.0
-                    text = ",".join(parts[3:]).strip()
-                else:
-                    # Format: start,end,text
-                    start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[1]) / 1000.0
-                    text = ",".join(parts[2:]).strip()
-                    speaker = None
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed lines
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _parse_aud(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse AUD (Audacity Labels) format caption file.
-
-        Format: start\tend\t[[speaker]]text
-        - Times are in seconds (float)
-        - Speaker is optional and enclosed in [[brackets]]
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            parts = line.split("\t")
-            if len(parts) < 3:
-                continue
-
-            try:
-                # AUD format: start\tend\ttext (speaker in [[brackets]])
-                start = float(parts[0])
-                end = float(parts[1])
-                text = "\t".join(parts[2:]).strip()
-
-                # Extract speaker from [[speaker]] prefix
-                speaker = None
-                speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
-                if speaker_match:
-                    speaker = speaker_match.group(1)
-                    text = speaker_match.group(2)
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed lines
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse SubViewer (SBV) format caption file.
-
-        Format:
-            0:00:00.000,0:00:02.000
-            Text line 1
-
-            0:00:02.000,0:00:04.000
-            Text line 2
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8") as f:
-            content = f.read()
-
-        # Split by double newlines to separate entries
-        entries = content.strip().split("\n\n")
-
-        for entry in entries:
-            lines = entry.strip().split("\n")
-            if len(lines) < 2:
-                continue
-
-            # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
-            timestamp_line = lines[0].strip()
-            # Remaining lines: text
-            text_lines = lines[1:]
-
-            try:
-                # Parse timestamp: 0:00:00.000,0:00:02.000
-                if "," not in timestamp_line:
-                    continue
-
-                start_str, end_str = timestamp_line.split(",", 1)
-
-                # Parse start time
-                start_parts = start_str.strip().split(":")
-                if len(start_parts) == 3:
-                    h, m, s = start_parts
-                    s_parts = s.split(".")
-                    start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                    if len(s_parts) > 1:
-                        start += int(s_parts[1]) / 1000.0
-                else:
-                    continue
-
-                # Parse end time
-                end_parts = end_str.strip().split(":")
-                if len(end_parts) == 3:
-                    h, m, s = end_parts
-                    s_parts = s.split(".")
-                    end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                    if len(s_parts) > 1:
-                        end += int(s_parts[1]) / 1000.0
-                else:
-                    continue
-
-                # Parse text and speaker
-                text = " ".join(text_lines).strip()
-                speaker, text = parse_speaker_text(text)
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed entries
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _write_tsv(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to TSV format.
-
-        Format: speaker\tstart\tend\ttext (with speaker)
-            or: start\tend\ttext (without speaker)
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output TSV file
-            include_speaker_in_text: Whether to include speaker column
-        """
-        with open(output_path, "w", encoding="utf-8") as file:
-            # Write header
-            if include_speaker_in_text:
-                file.write("speaker\tstart\tend\ttext\n")
-                for supervision in alignments:
-                    # Respect `original_speaker` custom flag: default to True when missing
-                    include_speaker = supervision.speaker and (
-                        not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                    )
-                    speaker = supervision.speaker if include_speaker else ""
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip().replace("\t", " ")
-                    file.write(f"{speaker}\t{start_ms}\t{end_ms}\t{text}\n")
-            else:
-                file.write("start\tend\ttext\n")
-                for supervision in alignments:
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip().replace("\t", " ")
-                    file.write(f"{start_ms}\t{end_ms}\t{text}\n")
-
-    @classmethod
-    def _write_csv(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to CSV format.
-
-        Format: speaker,start,end,text (with speaker)
-            or: start,end,text (without speaker)
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output CSV file
-            include_speaker_in_text: Whether to include speaker column
-        """
-        import csv
-
-        with open(output_path, "w", encoding="utf-8", newline="") as file:
-            if include_speaker_in_text:
-                writer = csv.writer(file)
-                writer.writerow(["speaker", "start", "end", "text"])
-                for supervision in alignments:
-                    include_speaker = supervision.speaker and (
-                        not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                    )
-                    speaker = supervision.speaker if include_speaker else ""
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip()
-                    writer.writerow([speaker, start_ms, end_ms, text])
-            else:
-                writer = csv.writer(file)
-                writer.writerow(["start", "end", "text"])
-                for supervision in alignments:
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip()
-                    writer.writerow([start_ms, end_ms, text])
-
-    @classmethod
-    def _write_aud(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to AUD format.
-
-        Format: start\tend\t[[speaker]]text
-            or: start\tend\ttext (without speaker)
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output AUD file
-            include_speaker_in_text: Whether to include speaker in [[brackets]]
-        """
-        with open(output_path, "w", encoding="utf-8") as file:
-            for supervision in alignments:
-                start = supervision.start
-                end = supervision.end
-                text = supervision.text.strip().replace("\t", " ")
-
-                # Respect `original_speaker` custom flag when adding speaker prefix
-                if (
-                    include_speaker_in_text
-                    and supervision.speaker
-                    and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                ):
-                    text = f"[[{supervision.speaker}]]{text}"
-
-                file.write(f"{start}\t{end}\t{text}\n")
-
-    @classmethod
-    def _write_sbv(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to SubViewer (SBV) format.
-
-        Format:
-            0:00:00.000,0:00:02.000
-            Text line 1
-
-            0:00:02.000,0:00:04.000
-            Text line 2
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output SBV file
-            include_speaker_in_text: Whether to include speaker in text
-        """
-        with open(output_path, "w", encoding="utf-8") as file:
-            for i, supervision in enumerate(alignments):
-                # Format timestamps as H:MM:SS.mmm
-                start_h = int(supervision.start // 3600)
-                start_m = int((supervision.start % 3600) // 60)
-                start_s = int(supervision.start % 60)
-                start_ms = int((supervision.start % 1) * 1000)
-
-                end_h = int(supervision.end // 3600)
-                end_m = int((supervision.end % 3600) // 60)
-                end_s = int(supervision.end % 60)
-                end_ms = int((supervision.end % 1) * 1000)
-
-                start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
-                end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
-
-                # Write timestamp line
-                file.write(f"{start_time},{end_time}\n")
-
-                # Write text (with optional speaker). Respect `original_speaker` custom flag.
-                text = supervision.text.strip()
-                if (
-                    include_speaker_in_text
-                    and supervision.speaker
-                    and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                ):
-                    text = f"{supervision.speaker}: {text}"
-
-                file.write(f"{text}\n")
-
-                # Add blank line between entries (except after last one)
-                if i < len(alignments) - 1:
-                    file.write("\n")
-
-    @classmethod
-    def _parse_caption(
-        cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
-    ) -> List[Supervision]:
-        """
-        Parse caption using pysubs2.
-
-        Args:
-            caption: Caption file path or content
-            format: Caption format
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        import pysubs2
-
-        try:
-            subs: pysubs2.SSAFile = pysubs2.load(
-                caption, encoding="utf-8", format_=format if format != "auto" else None
-            )  # file
-        except IOError:
-            try:
-                subs: pysubs2.SSAFile = pysubs2.SSAFile.from_string(
-                    caption, format_=format if format != "auto" else None
-                )  # str
-            except Exception as e:
-                del e
-                subs: pysubs2.SSAFile = pysubs2.load(caption, encoding="utf-8")  # auto detect format
-
-        # Parse supervisions
-        supervisions = []
-        for event in subs.events:
-            if normalize_text:
-                event.text = normalize_text_fn(event.text)
-            speaker, text = parse_speaker_text(event.text)
-            supervisions.append(
-                Supervision(
-                    text=text,
-                    speaker=speaker or event.name,
-                    start=event.start / 1000.0 if event.start is not None else None,
-                    duration=(event.end - event.start) / 1000.0 if event.end is not None else None,
-                )
-            )
-        return supervisions
-
     def __repr__(self) -> str:
         """String representation of Caption."""
         lang = f"lang={self.language}" if self.language else "lang=unknown"