lattifai-1.2.1-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (59)
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/caption/caption.py
@@ -1,1467 +0,0 @@
- """Caption data structure for storing subtitle information with metadata."""
-
- import json
- import re
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import Any, Dict, List, Optional, TypeVar
-
- from lhotse.supervision import AlignmentItem
- from lhotse.utils import Pathlike
- from tgt import TextGrid
-
- from ..config.caption import InputCaptionFormat, OutputCaptionFormat  # noqa: F401
- from .supervision import Supervision
- from .text_parser import normalize_text as normalize_text_fn
- from .text_parser import parse_speaker_text, parse_timestamp_text
-
- DiarizationOutput = TypeVar("DiarizationOutput")
-
-
- @dataclass
- class Caption:
-     """
-     Container for caption/subtitle data with metadata.
-
-     This class encapsulates a list of supervisions (subtitle segments) along with
-     metadata such as language, kind, format information, and source file details.
-
-     Attributes:
-         supervisions: List of supervision segments containing text and timing information
-         language: Language code (e.g., 'en', 'zh', 'es')
-         kind: Caption kind/type (e.g., 'captions', 'subtitles', 'descriptions')
-         source_format: Original format of the caption file (e.g., 'vtt', 'srt', 'json')
-         source_path: Path to the source caption file
-         metadata: Additional custom metadata as key-value pairs
-     """
-
-     # Read from subtitle file
-     supervisions: List[Supervision] = field(default_factory=list)
-     # Transcription results
-     transcription: List[Supervision] = field(default_factory=list)
-     # Audio Event Detection results
-     audio_events: Optional[TextGrid] = None
-     # Speaker Diarization results
-     speaker_diarization: Optional[DiarizationOutput] = None
-     # Alignment results
-     alignments: List[Supervision] = field(default_factory=list)
-
-     language: Optional[str] = None
-     kind: Optional[str] = None
-     source_format: Optional[str] = None
-     source_path: Optional[Pathlike] = None
-     metadata: Dict[str, str] = field(default_factory=dict)
-
-     def __len__(self) -> int:
-         """Return the number of supervision segments."""
-         return len(self.supervisions or self.transcription)
-
-     def __iter__(self):
-         """Iterate over supervision segments."""
-         return iter(self.supervisions)
-
-     def __getitem__(self, index):
-         """Get supervision segment by index."""
-         return self.supervisions[index]
-
-     def __bool__(self) -> bool:
-         """Return True if caption has supervisions."""
-         return self.__len__() > 0
-
-     @property
-     def is_empty(self) -> bool:
-         """Check if caption has no supervisions."""
-         return len(self.supervisions) == 0
-
-     @property
-     def duration(self) -> Optional[float]:
-         """
-         Get total duration of the caption in seconds.
-
-         Returns:
-             Total duration from first to last supervision, or None if empty
-         """
-         if not self.supervisions:
-             return None
-         return self.supervisions[-1].end - self.supervisions[0].start
-
-     @property
-     def start_time(self) -> Optional[float]:
-         """Get start time of first supervision."""
-         if not self.supervisions:
-             return None
-         return self.supervisions[0].start
-
-     @property
-     def end_time(self) -> Optional[float]:
-         """Get end time of last supervision."""
-         if not self.supervisions:
-             return None
-         return self.supervisions[-1].end
-
-     def append(self, supervision: Supervision) -> None:
-         """Add a supervision segment to the caption."""
-         self.supervisions.append(supervision)
-
-     def extend(self, supervisions: List[Supervision]) -> None:
-         """Add multiple supervision segments to the caption."""
-         self.supervisions.extend(supervisions)
-
-     def filter_by_speaker(self, speaker: str) -> "Caption":
-         """
-         Create a new Caption with only supervisions from a specific speaker.
-
-         Args:
-             speaker: Speaker identifier to filter by
-
-         Returns:
-             New Caption instance with filtered supervisions
-         """
-         filtered_sups = [sup for sup in self.supervisions if sup.speaker == speaker]
-         return Caption(
-             supervisions=filtered_sups,
-             language=self.language,
-             kind=self.kind,
-             source_format=self.source_format,
-             source_path=self.source_path,
-             metadata=self.metadata.copy(),
-         )
-
-     def get_speakers(self) -> List[str]:
-         """
-         Get list of unique speakers in the caption.
-
-         Returns:
-             Sorted list of unique speaker identifiers
-         """
-         speakers = {sup.speaker for sup in self.supervisions if sup.speaker}
-         return sorted(speakers)
-
-     def shift_time(self, seconds: float) -> "Caption":
-         """
-         Create a new Caption with all timestamps shifted by given seconds.
-
-         Args:
-             seconds: Number of seconds to shift (positive delays, negative advances)
-
-         Returns:
-             New Caption instance with shifted timestamps
-         """
-         shifted_sups = [
-             Supervision(
-                 text=sup.text,
-                 start=sup.start + seconds,
-                 duration=sup.duration,
-                 speaker=sup.speaker,
-                 id=sup.id,
-                 language=sup.language,
-                 alignment=sup.alignment if hasattr(sup, "alignment") else None,
-                 custom=sup.custom,
-             )
-             for sup in self.supervisions
-         ]
-
-         return Caption(
-             supervisions=shifted_sups,
-             language=self.language,
-             kind=self.kind,
-             source_format=self.source_format,
-             source_path=self.source_path,
-             metadata=self.metadata.copy(),
-         )
-
-     def to_string(self, format: str = "srt") -> str:
-         """
-         Return caption content in specified format.
-
-         Args:
-             format: Output format (e.g., 'srt', 'vtt', 'ass')
-
-         Returns:
-             String containing formatted captions
-         """
-         import pysubs2
-
-         subs = pysubs2.SSAFile()
-
-         if self.alignments:
-             alignments = self.alignments
-         else:
-             alignments = self.supervisions
-
-         if not alignments:
-             alignments = self.transcription
-
-         for sup in alignments:
-             # Add word-level timing as metadata in the caption text
-             word_items = self._parse_alignment_from_supervision(sup)
-             if word_items:
-                 for word in word_items:
-                     subs.append(
-                         pysubs2.SSAEvent(
-                             start=int(word.start * 1000),
-                             end=int(word.end * 1000),
-                             text=word.symbol,
-                             name=sup.speaker or "",
-                         )
-                     )
-             else:
-                 subs.append(
-                     pysubs2.SSAEvent(
-                         start=int(sup.start * 1000),
-                         end=int(sup.end * 1000),
-                         text=sup.text or "",
-                         name=sup.speaker or "",
-                     )
-                 )
-
-         return subs.to_string(format_=format)
-
-     def to_dict(self) -> Dict:
-         """
-         Convert Caption to dictionary representation.
-
-         Returns:
-             Dictionary with caption data and metadata
-         """
-         return {
-             "supervisions": [sup.to_dict() for sup in self.supervisions],
-             "language": self.language,
-             "kind": self.kind,
-             "source_format": self.source_format,
-             "source_path": str(self.source_path) if self.source_path else None,
-             "metadata": self.metadata,
-             "duration": self.duration,
-             "num_segments": len(self.supervisions),
-             "speakers": self.get_speakers(),
-         }
-
-     @classmethod
-     def from_supervisions(
-         cls,
-         supervisions: List[Supervision],
-         language: Optional[str] = None,
-         kind: Optional[str] = None,
-         source_format: Optional[str] = None,
-         source_path: Optional[Pathlike] = None,
-         metadata: Optional[Dict[str, str]] = None,
-     ) -> "Caption":
-         """
-         Create Caption from a list of supervisions.
-
-         Args:
-             supervisions: List of supervision segments
-             language: Language code
-             kind: Caption kind/type
-             source_format: Original format
-             source_path: Source file path
-             metadata: Additional metadata
-
-         Returns:
-             New Caption instance
-         """
-         return cls(
-             supervisions=supervisions,
-             language=language,
-             kind=kind,
-             source_format=source_format,
-             source_path=source_path,
-             metadata=metadata or {},
-         )
-
-     @classmethod
-     def from_transcription_results(
-         cls,
-         transcription: List[Supervision],
-         audio_events: Optional[TextGrid] = None,
-         speaker_diarization: Optional[DiarizationOutput] = None,
-         language: Optional[str] = None,
-         source_path: Optional[Pathlike] = None,
-         metadata: Optional[Dict[str, str]] = None,
-     ) -> "Caption":
-         """
-         Create Caption from transcription results including audio events and diarization.
-
-         Args:
-             transcription: List of transcription supervision segments
-             audio_events: Optional TextGrid with audio event detection results
-             speaker_diarization: Optional DiarizationOutput with speaker diarization results
-             language: Language code
-             source_path: Source file path
-             metadata: Additional metadata
-
-         Returns:
-             New Caption instance with transcription data
-         """
-         return cls(
-             transcription=transcription,
-             audio_events=audio_events,
-             speaker_diarization=speaker_diarization,
-             language=language,
-             kind="transcription",
-             source_format="asr",
-             source_path=source_path,
-             metadata=metadata or {},
-         )
-
-     @classmethod
-     def read(
-         cls,
-         path: Pathlike,
-         format: Optional[str] = None,
-         normalize_text: bool = True,
-     ) -> "Caption":
-         """
-         Read caption file and return Caption object.
-
-         Args:
-             path: Path to caption file
-             format: Caption format (auto-detected if not provided)
-             normalize_text: Whether to normalize text during reading
-
-         Returns:
-             Caption object containing supervisions and metadata
-
-         Example:
-             >>> caption = Caption.read("subtitles.srt")
-             >>> print(f"Loaded {len(caption)} segments")
-         """
-         caption_path = Path(str(path)) if not isinstance(path, Path) else path
-
-         # Detect format if not provided
-         if not format and caption_path.exists():
-             format = caption_path.suffix.lstrip(".").lower()
-         elif format:
-             format = format.lower()
-
-         # Extract metadata from file
-         metadata = cls._extract_metadata(path, format)
-
-         # Parse supervisions
-         supervisions = cls._parse_supervisions(path, format, normalize_text)
-
-         # Create Caption object
-         return cls(
-             supervisions=supervisions,
-             language=metadata.get("language"),
-             kind=metadata.get("kind"),
-             source_format=format,
-             source_path=str(caption_path) if caption_path.exists() else None,
-             metadata=metadata,
-         )
-
-     def write(
-         self,
-         path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> Pathlike:
-         """
-         Write caption to file.
-
-         Args:
-             path: Path to output caption file
-             include_speaker_in_text: Whether to include speaker labels in text
-
-         Returns:
-             Path to the written file
-
-         Example:
-             >>> caption = Caption.read("input.srt")
-             >>> caption.write("output.vtt", include_speaker_in_text=False)
-         """
-         if self.alignments:
-             alignments = self.alignments
-         else:
-             alignments = self.supervisions
-
-         if not alignments:
-             alignments = self.transcription
-
-         return self._write_caption(alignments, path, include_speaker_in_text)
-
-     def read_speaker_diarization(
-         self,
-         path: Pathlike,
-     ) -> TextGrid:
-         """
-         Read speaker diarization TextGrid from file.
-         """
-         from lattifai_core.diarization import DiarizationOutput
-
-         self.speaker_diarization = DiarizationOutput.read(path)
-         return self.speaker_diarization
-
-     def write_speaker_diarization(
-         self,
-         path: Pathlike,
-     ) -> Pathlike:
-         """
-         Write speaker diarization TextGrid to file.
-         """
-         if not self.speaker_diarization:
-             raise ValueError("No speaker diarization data to write.")
-
-         self.speaker_diarization.write(path)
-         return path
-
-     @staticmethod
-     def _parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
-         """
-         Extract word-level alignment items from Supervision object.
-
-         Args:
-             supervision: Supervision object with potential alignment data
-
-         Returns:
-             List of AlignmentItem objects, or None if no alignment data present
-         """
-         if not hasattr(supervision, "alignment") or not supervision.alignment:
-             return None
-
-         if "word" not in supervision.alignment:
-             return None
-
-         return supervision.alignment["word"]
-
-     @classmethod
-     def _write_caption(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> Pathlike:
-         """
-         Write caption to file in various formats.
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output file
-             include_speaker_in_text: Whether to include speaker in text
-
-         Returns:
-             Path to written file
-         """
-         if str(output_path)[-4:].lower() == ".txt":
-             with open(output_path, "w", encoding="utf-8") as f:
-                 for sup in alignments:
-                     word_items = cls._parse_alignment_from_supervision(sup)
-                     if word_items:
-                         for item in word_items:
-                             f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
-                     else:
-                         if include_speaker_in_text and sup.speaker is not None:
-                             # Use [SPEAKER]: format for consistency with parsing
-                             if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                                 text = f"[{sup.speaker}]: {sup.text}"
-                             else:
-                                 text = f"{sup.text}"
-                         else:
-                             text = sup.text
-                         f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
-
-         elif str(output_path)[-5:].lower() == ".json":
-             with open(output_path, "w", encoding="utf-8") as f:
-                 # Enhanced JSON export with word-level alignment
-                 json_data = []
-                 for sup in alignments:
-                     sup_dict = sup.to_dict()
-                     json_data.append(sup_dict)
-                 json.dump(json_data, f, ensure_ascii=False, indent=4)
-         elif str(output_path).lower().endswith(".textgrid"):
-             from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-             tg = TextGrid()
-             supervisions, words, scores = [], [], {"utterances": [], "words": []}
-             for supervision in sorted(alignments, key=lambda x: x.start):
-                 # Respect `original_speaker` custom flag: default to include speaker when missing
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker is not None
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"{supervision.speaker} {supervision.text}"
-                 else:
-                     text = supervision.text
-                 supervisions.append(Interval(supervision.start, supervision.end, text or ""))
-                 # Extract word-level alignment using helper function
-                 word_items = cls._parse_alignment_from_supervision(supervision)
-                 if word_items:
-                     for item in word_items:
-                         words.append(Interval(item.start, item.end, item.symbol))
-                         if item.score is not None:
-                             scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
-                 if supervision.has_custom("score"):
-                     scores["utterances"].append(
-                         Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
-                     )
-
-             tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
-             if words:
-                 tg.add_tier(IntervalTier(name="words", objects=words))
-
-             if scores["utterances"]:
-                 tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
-             if scores["words"]:
-                 tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
-
-             write_to_file(tg, output_path, format="long")
-         elif str(output_path)[-4:].lower() == ".tsv":
-             cls._write_tsv(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".csv":
-             cls._write_csv(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".aud":
-             cls._write_aud(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".sbv":
-             cls._write_sbv(alignments, output_path, include_speaker_in_text)
-         else:
-             import pysubs2
-
-             subs = pysubs2.SSAFile()
-             for sup in alignments:
-                 # Add word-level timing as metadata in the caption text
-                 word_items = cls._parse_alignment_from_supervision(sup)
-                 if word_items:
-                     for word in word_items:
-                         subs.append(
-                             pysubs2.SSAEvent(
-                                 start=int(word.start * 1000),
-                                 end=int(word.end * 1000),
-                                 text=word.symbol,
-                                 name=sup.speaker or "",
-                             )
-                         )
-                 else:
-                     if include_speaker_in_text and sup.speaker is not None:
-                         if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                             text = f"{sup.speaker} {sup.text}"
-                         else:
-                             text = f"{sup.text}"
-                     else:
-                         text = sup.text
-                     subs.append(
-                         pysubs2.SSAEvent(
-                             start=int(sup.start * 1000),
-                             end=int(sup.end * 1000),
-                             text=text or "",
-                             name=sup.speaker or "",
-                         )
-                     )
-
-             # MicroDVD format requires framerate to be specified
-             output_ext = str(output_path).lower().split(".")[-1]
-             if output_ext == "sub":
-                 # Default to 25 fps for MicroDVD format if not specified
-                 subs.save(output_path, fps=25.0)
-             else:
-                 subs.save(output_path)
-
-         return output_path
-
-     @classmethod
-     def _extract_metadata(cls, caption: Pathlike, format: Optional[str]) -> Dict[str, str]:
-         """
-         Extract metadata from caption file header.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-
-         Returns:
-             Dictionary of metadata key-value pairs
-         """
-         metadata = {}
-         caption_path = Path(str(caption))
-
-         if not caption_path.exists():
-             return metadata
-
-         try:
-             with open(caption_path, "r", encoding="utf-8") as f:
-                 content = f.read(2048)  # Read first 2KB for metadata
-
-             # WebVTT metadata extraction
-             if format == "vtt" or content.startswith("WEBVTT"):
-                 lines = content.split("\n")
-                 for line in lines[:10]:  # Check first 10 lines
-                     line = line.strip()
-                     if line.startswith("Kind:"):
-                         metadata["kind"] = line.split(":", 1)[1].strip()
-                     elif line.startswith("Language:"):
-                         metadata["language"] = line.split(":", 1)[1].strip()
-                     elif line.startswith("NOTE"):
-                         # Extract metadata from NOTE comments
-                         match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
-                         if match:
-                             key, value = match.groups()
-                             metadata[key.lower()] = value.strip()
-
-             # SRT doesn't have standard metadata, but check for BOM
-             elif format == "srt":
-                 if content.startswith("\ufeff"):
-                     metadata["encoding"] = "utf-8-sig"
-
-             # TextGrid metadata
-             elif format == "textgrid" or caption_path.suffix.lower() == ".textgrid":
-                 match = re.search(r"xmin\s*=\s*([\d.]+)", content)
-                 if match:
-                     metadata["xmin"] = match.group(1)
-                 match = re.search(r"xmax\s*=\s*([\d.]+)", content)
-                 if match:
-                     metadata["xmax"] = match.group(1)
-
-         except Exception:
-             # If metadata extraction fails, continue with empty metadata
-             pass
-
-         return metadata
-
-     @classmethod
-     def _parse_youtube_vtt_with_word_timestamps(
-         cls, content: str, normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse YouTube VTT format with word-level timestamps.
-
-         YouTube auto-generated captions use this format:
-         Word1<00:00:10.559><c> Word2</c><00:00:11.120><c> Word3</c>...
-
-         Args:
-             content: VTT file content
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects with word-level alignments
-         """
-         from lhotse.supervision import AlignmentItem
-
-         supervisions = []
-
-         # Pattern to match timestamp lines: 00:00:14.280 --> 00:00:17.269 align:start position:0%
-         timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")
-
-         # Pattern to match word-level timestamps: <00:00:10.559><c> word</c>
-         word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")
-
-         # Pattern to match the first word (before first timestamp)
-         first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")
-
-         def parse_timestamp(ts: str) -> float:
-             """Convert timestamp string to seconds."""
-             ts = ts.replace(",", ".")
-             parts = ts.split(":")
-             hours = int(parts[0])
-             minutes = int(parts[1])
-             seconds = float(parts[2])
-             return hours * 3600 + minutes * 60 + seconds
-
-         lines = content.split("\n")
-         i = 0
-         while i < len(lines):
-             line = lines[i].strip()
-
-             # Look for timestamp line
-             ts_match = timestamp_pattern.search(line)
-             if ts_match:
-                 cue_start = parse_timestamp(ts_match.group(1))
-                 cue_end = parse_timestamp(ts_match.group(2))
-
-                 # Read the next non-empty lines for cue content
-                 cue_lines = []
-                 i += 1
-                 while i < len(lines) and lines[i].strip() and not timestamp_pattern.search(lines[i]):
-                     cue_lines.append(lines[i])
-                     i += 1
-
-                 # Process cue content
-                 for cue_line in cue_lines:
-                     cue_line = cue_line.strip()
-                     if not cue_line:
-                         continue
-
-                     # Check if this line has word-level timestamps
-                     word_matches = word_timestamp_pattern.findall(cue_line)
-                     if word_matches:
-                         # This line has word-level timing
-                         word_alignments = []
-
-                         # Get the first word (before the first timestamp)
-                         first_match = first_word_pattern.match(cue_line)
-                         if first_match:
-                             first_word = first_match.group(1).strip()
-                             first_word_next_ts = parse_timestamp(first_match.group(2))
-                             if first_word:
-                                 # First word starts at cue_start
-                                 word_alignments.append(
-                                     AlignmentItem(
-                                         symbol=first_word,
-                                         start=cue_start,
-                                         duration=first_word_next_ts - cue_start,
-                                     )
-                                 )
-
-                         # Process remaining words with timestamps
-                         for idx, (ts, word) in enumerate(word_matches):
-                             word_start = parse_timestamp(ts)
-                             word = word.strip()
-                             if not word:
-                                 continue
-
-                             # Calculate duration based on next word's timestamp or cue end
-                             if idx + 1 < len(word_matches):
-                                 next_ts = parse_timestamp(word_matches[idx + 1][0])
-                                 duration = next_ts - word_start
-                             else:
-                                 duration = cue_end - word_start
-
-                             word_alignments.append(
-                                 AlignmentItem(
-                                     symbol=word,
-                                     start=word_start,
-                                     duration=max(0.01, duration),  # Ensure positive duration
-                                 )
-                             )
-
-                         if word_alignments:
-                             # Create supervision with word-level alignment
-                             full_text = " ".join(item.symbol for item in word_alignments)
-                             if normalize_text:
-                                 full_text = normalize_text_fn(full_text)
-
-                             sup_start = word_alignments[0].start
-                             sup_end = word_alignments[-1].start + word_alignments[-1].duration
-
-                             supervisions.append(
-                                 Supervision(
-                                     text=full_text,
-                                     start=sup_start,
-                                     duration=sup_end - sup_start,
-                                     alignment={"word": word_alignments},
-                                 )
-                             )
-                     else:
-                         # Plain text line without word-level timing - skip duplicate lines
-                         # (YouTube VTT often repeats the previous line without timestamps)
-                         pass
-
-                 continue
-             i += 1
-
-         # Merge consecutive supervisions to form complete utterances
-         if supervisions:
-             supervisions = cls._merge_youtube_vtt_supervisions(supervisions)
-
-         return supervisions
-
-     @classmethod
-     def _merge_youtube_vtt_supervisions(cls, supervisions: List[Supervision]) -> List[Supervision]:
-         """
-         Merge consecutive YouTube VTT supervisions into complete utterances.
-
-         YouTube VTT splits utterances across multiple cues. This method merges
-         cues that are close together in time.
-
-         Args:
-             supervisions: List of supervisions to merge
-
-         Returns:
-             List of merged supervisions
-         """
-         if not supervisions:
-             return supervisions
-
-         merged = []
-         current = supervisions[0]
-
-         for next_sup in supervisions[1:]:
-             # Check if next supervision is close enough to merge (within 0.5 seconds)
-             gap = next_sup.start - (current.start + current.duration)
-
-             if gap < 0.5 and current.alignment and next_sup.alignment:
-                 # Merge alignments
-                 current_words = current.alignment.get("word", [])
-                 next_words = next_sup.alignment.get("word", [])
-                 merged_words = list(current_words) + list(next_words)
-
-                 # Create merged supervision
-                 merged_text = current.text + " " + next_sup.text
-                 merged_end = next_sup.start + next_sup.duration
-
-                 current = Supervision(
-                     text=merged_text,
-                     start=current.start,
-                     duration=merged_end - current.start,
-                     alignment={"word": merged_words},
-                 )
-             else:
-                 merged.append(current)
-                 current = next_sup
-
-         merged.append(current)
-         return merged
-
-     @classmethod
-     def _is_youtube_vtt_with_word_timestamps(cls, content: str) -> bool:
-         """
-         Check if content is YouTube VTT format with word-level timestamps.
-
-         Args:
-             content: File content to check
-
-         Returns:
-             True if content contains YouTube-style word timestamps
-         """
-         # Look for pattern like <00:00:10.559><c> word</c>
-         return bool(re.search(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>", content))
-
-     @classmethod
-     def _parse_supervisions(
-         cls, caption: Pathlike, format: Optional[str], normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse supervisions from caption file.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         if format:
-             format = format.lower()
-
-         # Check for YouTube VTT with word-level timestamps first
-         caption_path = Path(str(caption))
-         if caption_path.exists():
-             with open(caption_path, "r", encoding="utf-8") as f:
-                 content = f.read()
-             if cls._is_youtube_vtt_with_word_timestamps(content):
-                 return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
-
-         # Match Gemini format: explicit format, or files ending with Gemini.md/Gemini3.md,
-         # or files containing "gemini" in the name with .md extension
-         caption_str = str(caption).lower()
-         is_gemini_format = (
-             format == "gemini"
-             or str(caption).endswith("Gemini.md")
-             or str(caption).endswith("Gemini3.md")
-             or ("gemini" in caption_str and caption_str.endswith(".md"))
-         )
-         if is_gemini_format:
-             from .gemini_reader import GeminiReader
-
-             supervisions = GeminiReader.extract_for_alignment(caption)
-         elif format and (format == "textgrid" or str(caption).lower().endswith("textgrid")):
-             # Internal usage
-             from tgt import read_textgrid
-
-             tgt = read_textgrid(caption)
-             supervisions = []
-             for tier in tgt.tiers:
-                 supervisions.extend(
-                     [
-                         Supervision(
-                             text=interval.text,
-                             start=interval.start_time,
-                             duration=interval.end_time - interval.start_time,
-                             speaker=tier.name,
-                         )
-                         for interval in tier.intervals
-                     ]
-                 )
-             supervisions = sorted(supervisions, key=lambda x: x.start)
-         elif format == "tsv" or str(caption)[-4:].lower() == ".tsv":
-             supervisions = cls._parse_tsv(caption, normalize_text)
-         elif format == "csv" or str(caption)[-4:].lower() == ".csv":
-             supervisions = cls._parse_csv(caption, normalize_text)
-         elif format == "aud" or str(caption)[-4:].lower() == ".aud":
-             supervisions = cls._parse_aud(caption, normalize_text)
-         elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
-             supervisions = cls._parse_sbv(caption, normalize_text)
-         elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
-             if not Path(str(caption)).exists():  # str
-                 lines = [line.strip() for line in str(caption).split("\n")]
-             else:  # file
-                 path_str = str(caption)
-                 with open(path_str, encoding="utf-8") as f:
-                     lines = [line.strip() for line in f.readlines()]
-             if normalize_text:
-                 lines = [normalize_text_fn(line) for line in lines]
-             supervisions = []
-             for line in lines:
-                 if line:
-                     # First try to parse timestamp format: [start-end] text
-                     start, end, remaining_text = parse_timestamp_text(line)
-                     if start is not None and end is not None:
-                         # Has timestamp, now check for speaker in the remaining text
-                         speaker, text = parse_speaker_text(remaining_text)
-                         supervisions.append(
-                             Supervision(
-                                 text=text,
-                                 start=start,
-                                 duration=end - start,
-                                 speaker=speaker,
-                             )
-                         )
-                     else:
-                         # No timestamp, just parse speaker and text
-                         speaker, text = parse_speaker_text(line)
-                         supervisions.append(Supervision(text=text, speaker=speaker))
-         else:
-             try:
-                 supervisions = cls._parse_caption(caption, format=format, normalize_text=normalize_text)
-             except Exception as e:
-                 print(f"Failed to parse caption with Format: {format}, Exception: {e}, trying 'gemini' parser.")
-                 from .gemini_reader import GeminiReader
-
-                 supervisions = GeminiReader.extract_for_alignment(caption)
-
-         return supervisions
-
-     @classmethod
-     def _parse_tsv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse TSV (Tab-Separated Values) format caption file.
-
-         Format specifications:
-         - With speaker: speaker\tstart\tend\ttext
-         - Without speaker: start\tend\ttext
-         - Times are in milliseconds
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         # Check if first line is a header
-         first_line = lines[0].strip().lower()
-         has_header = "start" in first_line and "end" in first_line and "text" in first_line
-         has_speaker_column = "speaker" in first_line
-
-         start_idx = 1 if has_header else 0
-
-         for line in lines[start_idx:]:
-             line = line.strip()
-             if not line:
-                 continue
-
-             parts = line.split("\t")
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 if has_speaker_column and len(parts) >= 4:
-                     # Format: speaker\tstart\tend\ttext
-                     speaker = parts[0].strip() if parts[0].strip() else None
-                     start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[2]) / 1000.0
-                     text = "\t".join(parts[3:]).strip()
-                 else:
-                     # Format: start\tend\ttext
-                     start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[1]) / 1000.0
-                     text = "\t".join(parts[2:]).strip()
-                     speaker = None
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_csv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse CSV (Comma-Separated Values) format caption file.
-
-         Format specifications:
-         - With speaker: speaker,start,end,text
-         - Without speaker: start,end,text
-         - Times are in milliseconds
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         import csv
-
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8", newline="") as f:
-             reader = csv.reader(f)
-             lines = list(reader)
-
-         if not lines:
-             return supervisions
-
-         # Check if first line is a header
-         first_line = [col.strip().lower() for col in lines[0]]
-         has_header = "start" in first_line and "end" in first_line and "text" in first_line
-         has_speaker_column = "speaker" in first_line
-
-         start_idx = 1 if has_header else 0
-
-         for parts in lines[start_idx:]:
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 if has_speaker_column and len(parts) >= 4:
-                     # Format: speaker,start,end,text
-                     speaker = parts[0].strip() if parts[0].strip() else None
-                     start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[2]) / 1000.0
-                     text = ",".join(parts[3:]).strip()
-                 else:
-                     # Format: start,end,text
-                     start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[1]) / 1000.0
-                     text = ",".join(parts[2:]).strip()
-                     speaker = None
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_aud(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse AUD (Audacity Labels) format caption file.
-
-         Format: start\tend\t[[speaker]]text
-         - Times are in seconds (float)
-         - Speaker is optional and enclosed in [[brackets]]
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         for line in lines:
-             line = line.strip()
-             if not line:
-                 continue
-
-             parts = line.split("\t")
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 # AUD format: start\tend\ttext (speaker in [[brackets]])
-                 start = float(parts[0])
-                 end = float(parts[1])
-                 text = "\t".join(parts[2:]).strip()
-
-                 # Extract speaker from [[speaker]] prefix
-                 speaker = None
-                 speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
-                 if speaker_match:
-                     speaker = speaker_match.group(1)
-                     text = speaker_match.group(2)
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse SubViewer (SBV) format caption file.
-
-         Format:
-             0:00:00.000,0:00:02.000
-             Text line 1
-
-             0:00:02.000,0:00:04.000
-             Text line 2
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             content = f.read()
-
-         # Split by double newlines to separate entries
-         entries = content.strip().split("\n\n")
-
-         for entry in entries:
-             lines = entry.strip().split("\n")
-             if len(lines) < 2:
-                 continue
-
-             # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
-             timestamp_line = lines[0].strip()
-             # Remaining lines: text
-             text_lines = lines[1:]
-
-             try:
-                 # Parse timestamp: 0:00:00.000,0:00:02.000
-                 if "," not in timestamp_line:
-                     continue
-
-                 start_str, end_str = timestamp_line.split(",", 1)
-
-                 # Parse start time
-                 start_parts = start_str.strip().split(":")
-                 if len(start_parts) == 3:
-                     h, m, s = start_parts
-                     s_parts = s.split(".")
-                     start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                     if len(s_parts) > 1:
-                         start += int(s_parts[1]) / 1000.0
-                 else:
-                     continue
-
-                 # Parse end time
-                 end_parts = end_str.strip().split(":")
-                 if len(end_parts) == 3:
-                     h, m, s = end_parts
-                     s_parts = s.split(".")
-                     end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                     if len(s_parts) > 1:
-                         end += int(s_parts[1]) / 1000.0
-                 else:
-                     continue
-
-                 # Parse text and speaker
-                 text = " ".join(text_lines).strip()
-                 speaker, text = parse_speaker_text(text)
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed entries
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _write_tsv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to TSV format.
-
-         Format: speaker\tstart\tend\ttext (with speaker)
-         or: start\tend\ttext (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output TSV file
-             include_speaker_in_text: Whether to include speaker column
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             # Write header
-             if include_speaker_in_text:
-                 file.write("speaker\tstart\tend\ttext\n")
-                 for supervision in alignments:
-                     # Respect `original_speaker` custom flag: default to True when missing
-                     include_speaker = supervision.speaker and (
-                         not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                     )
-                     speaker = supervision.speaker if include_speaker else ""
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip().replace("\t", " ")
-                     file.write(f"{speaker}\t{start_ms}\t{end_ms}\t{text}\n")
-             else:
-                 file.write("start\tend\ttext\n")
-                 for supervision in alignments:
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip().replace("\t", " ")
-                     file.write(f"{start_ms}\t{end_ms}\t{text}\n")
-
-     @classmethod
-     def _write_csv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to CSV format.
-
-         Format: speaker,start,end,text (with speaker)
-         or: start,end,text (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output CSV file
-             include_speaker_in_text: Whether to include speaker column
-         """
-         import csv
-
-         with open(output_path, "w", encoding="utf-8", newline="") as file:
-             if include_speaker_in_text:
-                 writer = csv.writer(file)
-                 writer.writerow(["speaker", "start", "end", "text"])
-                 for supervision in alignments:
-                     include_speaker = supervision.speaker and (
-                         not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                     )
-                     speaker = supervision.speaker if include_speaker else ""
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip()
-                     writer.writerow([speaker, start_ms, end_ms, text])
-             else:
-                 writer = csv.writer(file)
-                 writer.writerow(["start", "end", "text"])
-                 for supervision in alignments:
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip()
-                     writer.writerow([start_ms, end_ms, text])
-
-     @classmethod
-     def _write_aud(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to AUD format.
-
-         Format: start\tend\t[[speaker]]text
-         or: start\tend\ttext (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output AUD file
-             include_speaker_in_text: Whether to include speaker in [[brackets]]
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             for supervision in alignments:
-                 start = supervision.start
-                 end = supervision.end
-                 text = supervision.text.strip().replace("\t", " ")
-
-                 # Respect `original_speaker` custom flag when adding speaker prefix
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"[[{supervision.speaker}]]{text}"
-
-                 file.write(f"{start}\t{end}\t{text}\n")
-
-     @classmethod
-     def _write_sbv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to SubViewer (SBV) format.
-
-         Format:
-             0:00:00.000,0:00:02.000
-             Text line 1
-
-             0:00:02.000,0:00:04.000
-             Text line 2
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output SBV file
-             include_speaker_in_text: Whether to include speaker in text
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             for i, supervision in enumerate(alignments):
-                 # Format timestamps as H:MM:SS.mmm
-                 start_h = int(supervision.start // 3600)
-                 start_m = int((supervision.start % 3600) // 60)
-                 start_s = int(supervision.start % 60)
-                 start_ms = int((supervision.start % 1) * 1000)
-
-                 end_h = int(supervision.end // 3600)
-                 end_m = int((supervision.end % 3600) // 60)
-                 end_s = int(supervision.end % 60)
-                 end_ms = int((supervision.end % 1) * 1000)
-
-                 start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
-                 end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
-
-                 # Write timestamp line
-                 file.write(f"{start_time},{end_time}\n")
-
-                 # Write text (with optional speaker). Respect `original_speaker` custom flag.
-                 text = supervision.text.strip()
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"{supervision.speaker}: {text}"
-
-                 file.write(f"{text}\n")
-
-                 # Add blank line between entries (except after last one)
-                 if i < len(alignments) - 1:
-                     file.write("\n")
-
-     @classmethod
-     def _parse_caption(
-         cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse caption using pysubs2.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         import pysubs2
-
-         try:
-             subs: pysubs2.SSAFile = pysubs2.load(
-                 caption, encoding="utf-8", format_=format if format != "auto" else None
-             )  # file
-         except IOError:
-             try:
-                 subs: pysubs2.SSAFile = pysubs2.SSAFile.from_string(
-                     caption, format_=format if format != "auto" else None
-                 )  # str
-             except Exception as e:
-                 del e
-                 subs: pysubs2.SSAFile = pysubs2.load(caption, encoding="utf-8")  # auto detect format
-
-         # Parse supervisions
-         supervisions = []
-         for event in subs.events:
-             if normalize_text:
-                 event.text = normalize_text_fn(event.text)
-             speaker, text = parse_speaker_text(event.text)
-             supervisions.append(
-                 Supervision(
-                     text=text,
-                     speaker=speaker or event.name,
-                     start=event.start / 1000.0 if event.start is not None else None,
-                     duration=(event.end - event.start) / 1000.0 if event.end is not None else None,
-                 )
-             )
-         return supervisions
-
-     def __repr__(self) -> str:
-         """String representation of Caption."""
-         lang = f"lang={self.language}" if self.language else "lang=unknown"
-         kind_str = f"kind={self.kind}" if self.kind else ""
-         parts = [f"Caption({len(self.supervisions or self.transcription)} segments", lang]
-         if kind_str:
-             parts.append(kind_str)
-         if self.duration:
-             parts.append(f"duration={self.duration:.2f}s")
-         return ", ".join(parts) + ")"