lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (60)
  1. lattifai/alignment/__init__.py +10 -1
  2. lattifai/alignment/lattice1_aligner.py +66 -58
  3. lattifai/alignment/punctuation.py +38 -0
  4. lattifai/alignment/sentence_splitter.py +152 -21
  5. lattifai/alignment/text_align.py +440 -0
  6. lattifai/alignment/tokenizer.py +82 -40
  7. lattifai/caption/__init__.py +82 -6
  8. lattifai/caption/caption.py +335 -1141
  9. lattifai/caption/formats/__init__.py +199 -0
  10. lattifai/caption/formats/base.py +211 -0
  11. lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
  12. lattifai/caption/formats/json.py +194 -0
  13. lattifai/caption/formats/lrc.py +309 -0
  14. lattifai/caption/formats/nle/__init__.py +9 -0
  15. lattifai/caption/formats/nle/audition.py +561 -0
  16. lattifai/caption/formats/nle/avid.py +423 -0
  17. lattifai/caption/formats/nle/fcpxml.py +549 -0
  18. lattifai/caption/formats/nle/premiere.py +589 -0
  19. lattifai/caption/formats/pysubs2.py +642 -0
  20. lattifai/caption/formats/sbv.py +147 -0
  21. lattifai/caption/formats/tabular.py +338 -0
  22. lattifai/caption/formats/textgrid.py +193 -0
  23. lattifai/caption/formats/ttml.py +652 -0
  24. lattifai/caption/formats/vtt.py +469 -0
  25. lattifai/caption/parsers/__init__.py +9 -0
  26. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  27. lattifai/caption/standardize.py +636 -0
  28. lattifai/caption/utils.py +474 -0
  29. lattifai/cli/__init__.py +2 -1
  30. lattifai/cli/caption.py +108 -1
  31. lattifai/cli/transcribe.py +1 -1
  32. lattifai/cli/youtube.py +4 -1
  33. lattifai/client.py +33 -113
  34. lattifai/config/__init__.py +11 -1
  35. lattifai/config/alignment.py +7 -0
  36. lattifai/config/caption.py +267 -23
  37. lattifai/config/media.py +20 -0
  38. lattifai/diarization/__init__.py +41 -1
  39. lattifai/mixin.py +27 -15
  40. lattifai/transcription/base.py +6 -1
  41. lattifai/transcription/lattifai.py +19 -54
  42. lattifai/utils.py +7 -13
  43. lattifai/workflow/__init__.py +28 -4
  44. lattifai/workflow/file_manager.py +2 -5
  45. lattifai/youtube/__init__.py +43 -0
  46. lattifai/youtube/client.py +1170 -0
  47. lattifai/youtube/types.py +23 -0
  48. lattifai-1.2.2.dist-info/METADATA +615 -0
  49. lattifai-1.2.2.dist-info/RECORD +76 -0
  50. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  51. lattifai/caption/gemini_writer.py +0 -173
  52. lattifai/cli/app_installer.py +0 -142
  53. lattifai/cli/server.py +0 -44
  54. lattifai/server/app.py +0 -427
  55. lattifai/workflow/youtube.py +0 -577
  56. lattifai-1.2.1.dist-info/METADATA +0 -1134
  57. lattifai-1.2.1.dist-info/RECORD +0 -58
  58. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  60. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
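
The headline change in this release is the caption I/O rework: the monolithic parsing and writing code in lattifai/caption/caption.py moves into a lattifai/caption/formats/ registry (detect_format, get_reader, get_writer), and Caption gains in-memory APIs (from_string, to_bytes, BytesIO/StringIO support in read/write). A minimal sketch of the reworked surface, assuming Caption is re-exported from lattifai.caption; the file names below are placeholders:

    import io

    from lattifai.caption import Caption

    # Read from disk; the format is auto-detected via the new formats registry.
    caption = Caption.read("episode.srt")

    # Read from an in-memory buffer; format= is now mandatory for streams.
    buffer = io.StringIO("1\n00:00:00,000 --> 00:00:02,000\nHello world\n")
    in_memory = Caption.read(buffer, format="srt")

    # Write to a path, or skip the filesystem entirely and get raw bytes back.
    caption.write("episode.vtt", output_format="vtt")
    vtt_bytes = caption.to_bytes(output_format="vtt")

The full diff of lattifai/caption/caption.py follows.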
@@ -1,19 +1,22 @@
  """Caption data structure for storing subtitle information with metadata."""

- import json
- import re
+ from __future__ import annotations
+
+ import io
  from dataclasses import dataclass, field
  from pathlib import Path
- from typing import Any, Dict, List, Optional, TypeVar
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
+
+ if TYPE_CHECKING:
+     from ..config.caption import KaraokeConfig

  from lhotse.supervision import AlignmentItem
  from lhotse.utils import Pathlike
  from tgt import TextGrid

  from ..config.caption import InputCaptionFormat, OutputCaptionFormat  # noqa: F401
+ from .formats import detect_format, get_reader, get_writer
  from .supervision import Supervision
- from .text_parser import normalize_text as normalize_text_fn
- from .text_parser import parse_speaker_text, parse_timestamp_text

  DiarizationOutput = TypeVar("DiarizationOutput")

@@ -50,7 +53,7 @@ class Caption:
      kind: Optional[str] = None
      source_format: Optional[str] = None
      source_path: Optional[Pathlike] = None
-     metadata: Dict[str, str] = field(default_factory=dict)
+     metadata: Dict[str, Any] = field(default_factory=dict)

      def __len__(self) -> int:
          """Return the number of supervision segments."""
@@ -66,7 +69,7 @@ class Caption:

      def __bool__(self) -> bool:
          """Return True if caption has supervisions."""
-         return self.__len__() > 0
+         return len(self) > 0

      @property
      def is_empty(self) -> bool:
@@ -147,19 +150,72 @@ class Caption:
          Returns:
              New Caption instance with shifted timestamps
          """
-         shifted_sups = [
-             Supervision(
-                 text=sup.text,
-                 start=sup.start + seconds,
-                 duration=sup.duration,
-                 speaker=sup.speaker,
-                 id=sup.id,
-                 language=sup.language,
-                 alignment=sup.alignment if hasattr(sup, "alignment") else None,
-                 custom=sup.custom,
+         shifted_sups = []
+         for sup in self.supervisions:
+             # Calculate physical time range
+             raw_start = sup.start + seconds
+             raw_end = sup.end + seconds
+
+             # Skip segments that end before 0
+             if raw_end <= 0:
+                 continue
+
+             # Clip start to 0 if negative
+             if raw_start < 0:
+                 final_start = 0.0
+                 final_duration = raw_end
+             else:
+                 final_start = raw_start
+                 final_duration = sup.duration
+
+             # Handle alignment (word-level timestamps)
+             final_alignment = None
+             original_alignment = getattr(sup, "alignment", None)
+             if original_alignment and "word" in original_alignment:
+                 new_words = []
+                 for word in original_alignment["word"]:
+                     w_start = word.start + seconds
+                     w_end = w_start + word.duration
+
+                     # Skip words that end before 0
+                     if w_end <= 0:
+                         continue
+
+                     # Clip start to 0 if negative
+                     if w_start < 0:
+                         w_final_start = 0.0
+                         w_final_duration = w_end
+                     else:
+                         w_final_start = w_start
+                         w_final_duration = word.duration
+
+                     new_words.append(
+                         AlignmentItem(
+                             symbol=word.symbol,
+                             start=w_final_start,
+                             duration=w_final_duration,
+                             score=word.score,
+                         )
+                     )
+
+                 # Copy original alignment dict structure and update words
+                 final_alignment = original_alignment.copy()
+                 final_alignment["word"] = new_words
+
+             shifted_sups.append(
+                 Supervision(
+                     text=sup.text,
+                     start=final_start,
+                     duration=final_duration,
+                     speaker=sup.speaker,
+                     id=sup.id,
+                     recording_id=sup.recording_id if hasattr(sup, "recording_id") else "",
+                     channel=getattr(sup, "channel", 0),
+                     language=sup.language,
+                     alignment=final_alignment,
+                     custom=sup.custom,
+                 )
              )
-             for sup in self.supervisions
-         ]

          return Caption(
              supervisions=shifted_sups,
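
The rewritten shifting logic above no longer produces negative timestamps: segments that end at or before 0 after the shift are dropped, segments that straddle 0 are clipped, and word-level alignment items are shifted and clipped the same way. A hedged sketch of the resulting behavior (the method name is not visible in this hunk, so Caption.shift(seconds) is assumed, as is Supervision being importable from lattifai.caption):

    from lattifai.caption import Caption, Supervision

    cap = Caption(
        supervisions=[
            Supervision(text="intro", start=1.0, duration=2.0),  # ends at 3.0s
            Supervision(text="body", start=6.0, duration=3.0),   # ends at 9.0s
        ]
    )

    shifted = cap.shift(-4.0)  # assumed method name for the body shown above
    # "intro" would end at -1.0s after the shift, so it is skipped entirely.
    # "body" moves to start=2.0s and keeps its original 3.0s duration.
    assert len(shifted) == 1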
@@ -170,52 +226,90 @@
              metadata=self.metadata.copy(),
          )

-     def to_string(self, format: str = "srt") -> str:
+     def with_margins(
+         self,
+         start_margin: float = 0.08,
+         end_margin: float = 0.20,
+         min_gap: float = 0.08,
+         collision_mode: str = "trim",
+     ) -> "Caption":
          """
-         Return caption content in specified format.
+         Create a new Caption with segment boundaries adjusted based on word-level alignment.
+
+         Uses supervision.alignment['word'] to recalculate segment start/end times
+         with the specified margins applied around the actual speech boundaries.

          Args:
-             format: Output format (e.g., 'srt', 'vtt', 'ass')
+             start_margin: Seconds to extend before the first word (default: 0.08)
+             end_margin: Seconds to extend after the last word (default: 0.20)
+             min_gap: Minimum gap between segments for collision handling (default: 0.08)
+             collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')

          Returns:
-             String containing formatted captions
-         """
-         import pysubs2
+             New Caption instance with adjusted timestamps
+
+         Note:
+             Segments without alignment data will keep their original timestamps.

-         subs = pysubs2.SSAFile()
+         Example:
+             >>> caption = Caption.read("aligned.srt")
+             >>> adjusted = caption.with_margins(start_margin=0.05, end_margin=0.15)
+             >>> adjusted.write("output.srt")
+         """
+         from .standardize import apply_margins_to_captions

+         # Determine which supervisions to use
          if self.alignments:
-             alignments = self.alignments
+             source_sups = self.alignments
+         elif self.supervisions:
+             source_sups = self.supervisions
          else:
-             alignments = self.supervisions
-
-         if not alignments:
-             alignments = self.transcription
-
-         for sup in alignments:
-             # Add word-level timing as metadata in the caption text
-             word_items = self._parse_alignment_from_supervision(sup)
-             if word_items:
-                 for word in word_items:
-                     subs.append(
-                         pysubs2.SSAEvent(
-                             start=int(word.start * 1000),
-                             end=int(word.end * 1000),
-                             text=word.symbol,
-                             name=sup.speaker or "",
-                         )
-                     )
-             else:
-                 subs.append(
-                     pysubs2.SSAEvent(
-                         start=int(sup.start * 1000),
-                         end=int(sup.end * 1000),
-                         text=sup.text or "",
-                         name=sup.speaker or "",
-                     )
-                 )
+             source_sups = self.transcription
+
+         adjusted_sups = apply_margins_to_captions(
+             source_sups,
+             start_margin=start_margin,
+             end_margin=end_margin,
+             min_gap=min_gap,
+             collision_mode=collision_mode,
+         )
+
+         return Caption(
+             supervisions=adjusted_sups,
+             transcription=self.transcription,
+             audio_events=self.audio_events,
+             speaker_diarization=self.speaker_diarization,
+             alignments=[],  # Clear alignments since we've applied them
+             language=self.language,
+             kind=self.kind,
+             source_format=self.source_format,
+             source_path=self.source_path,
+             metadata=self.metadata.copy(),
+         )
+
+     def to_string(
+         self,
+         format: str = "srt",
+         word_level: bool = False,
+         karaoke_config: Optional["KaraokeConfig"] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> str:
+         """
+         Return caption content in specified format.

-         return subs.to_string(format_=format)
+         Args:
+             format: Output format (e.g., 'srt', 'vtt', 'ass')
+             word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+             metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+         Returns:
+             String containing formatted captions
+         """
+         return self.to_bytes(
+             output_format=format, word_level=word_level, karaoke_config=karaoke_config, metadata=metadata
+         ).decode("utf-8")

      def to_dict(self) -> Dict:
          """
@@ -269,6 +363,71 @@ class Caption:
              metadata=metadata or {},
          )

+     @classmethod
+     def from_string(
+         cls,
+         content: str,
+         format: str,
+         normalize_text: bool = True,
+     ) -> "Caption":
+         """
+         Create Caption from string content.
+
+         Args:
+             content: Caption content as string
+             format: Caption format (e.g., 'srt', 'vtt', 'ass')
+             normalize_text: Whether to normalize text during reading
+
+         Returns:
+             New Caption instance
+
+         Example:
+             >>> srt_content = \"\"\"1
+             ... 00:00:00,000 --> 00:00:02,000
+             ... Hello world\"\"\"
+             >>> caption = Caption.from_string(srt_content, format=\"srt\")
+         """
+         buffer = io.StringIO(content)
+         return cls.read(buffer, format=format, normalize_text=normalize_text)
+
+     def to_bytes(
+         self,
+         output_format: Optional[str] = None,
+         include_speaker_in_text: bool = True,
+         word_level: bool = False,
+         karaoke_config: Optional["KaraokeConfig"] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> bytes:
+         """
+         Convert caption to bytes.
+
+         Args:
+             output_format: Output format (e.g., 'srt', 'vtt', 'ass'). Defaults to source_format or 'srt'
+             include_speaker_in_text: Whether to include speaker labels in text
+             word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+             metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+         Returns:
+             Caption content as bytes
+
+         Example:
+             >>> caption = Caption.read("input.srt")
+             >>> # Get as bytes in original format
+             >>> data = caption.to_bytes()
+             >>> # Get as bytes in specific format
+             >>> vtt_data = caption.to_bytes(output_format="vtt")
+         """
+         return self.write(
+             None,
+             output_format=output_format,
+             include_speaker_in_text=include_speaker_in_text,
+             word_level=word_level,
+             karaoke_config=karaoke_config,
+             metadata=metadata,
+         )
+
      @classmethod
      def from_transcription_results(
          cls,
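
from_string and to_bytes make Caption fully round-trippable in memory: parsing routes a StringIO into read(), and serialization routes through write(None, ...). A sketch assembled from the docstrings above:

    srt_content = """1
    00:00:00,000 --> 00:00:02,000
    Hello world"""

    caption = Caption.from_string(srt_content, format="srt")

    data = caption.to_bytes()                         # bytes in the source format (srt)
    vtt_data = caption.to_bytes(output_format="vtt")  # bytes in another format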
@@ -307,82 +466,168 @@
      @classmethod
      def read(
          cls,
-         path: Pathlike,
+         path: Union[Pathlike, io.BytesIO, io.StringIO],
          format: Optional[str] = None,
          normalize_text: bool = True,
      ) -> "Caption":
          """
-         Read caption file and return Caption object.
+         Read caption file or in-memory data and return Caption object.

          Args:
-             path: Path to caption file
-             format: Caption format (auto-detected if not provided)
+             path: Path to caption file, or BytesIO/StringIO object with caption content
+             format: Caption format (auto-detected if not provided, required for in-memory data)
              normalize_text: Whether to normalize text during reading

          Returns:
              Caption object containing supervisions and metadata
-
-         Example:
-             >>> caption = Caption.read("subtitles.srt")
-             >>> print(f"Loaded {len(caption)} segments")
          """
-         caption_path = Path(str(path)) if not isinstance(path, Path) else path
-
          # Detect format if not provided
-         if not format and caption_path.exists():
-             format = caption_path.suffix.lstrip(".").lower()
-         elif format:
-             format = format.lower()
+         if not format:
+             if isinstance(path, (io.BytesIO, io.StringIO)):
+                 raise ValueError("format parameter is required when reading from BytesIO/StringIO")
+             format = detect_format(str(path))
+
+         if not format:
+             # Fallback to extension
+             if not isinstance(path, (io.BytesIO, io.StringIO)):
+                 format = Path(str(path)).suffix.lstrip(".").lower()
+
+         if not format:
+             format = "srt"  # Last resort default

-         # Extract metadata from file
-         metadata = cls._extract_metadata(path, format)
+         # Get content if it's an in-memory buffer
+         source = path
+         if isinstance(path, io.BytesIO):
+             source = path.read().decode("utf-8")
+         elif isinstance(path, io.StringIO):
+             source = path.read()

-         # Parse supervisions
-         supervisions = cls._parse_supervisions(path, format, normalize_text)
+         # Reset buffer position if it was a stream
+         if isinstance(path, (io.BytesIO, io.StringIO)):
+             path.seek(0)
+
+         # Get reader and perform extraction
+         reader_cls = get_reader(format)
+         if not reader_cls:
+             # Use pysubs2 as a generic fallback if no specific reader exists
+             from .formats.pysubs2 import Pysubs2Format
+
+             reader_cls = Pysubs2Format
+
+         supervisions = reader_cls.read(source, normalize_text=normalize_text)
+         metadata = reader_cls.extract_metadata(source)

          # Create Caption object
+         source_path = None
+         if isinstance(path, (str, Path)) and not ("\n" in str(path) or len(str(path)) > 500):
+             try:
+                 p = Path(str(path))
+                 if p.exists():
+                     source_path = str(p)
+             except (OSError, ValueError):
+                 pass
+
          return cls(
              supervisions=supervisions,
              language=metadata.get("language"),
              kind=metadata.get("kind"),
              source_format=format,
-             source_path=str(caption_path) if caption_path.exists() else None,
+             source_path=source_path,
              metadata=metadata,
          )

      def write(
          self,
-         path: Pathlike,
+         path: Union[Pathlike, io.BytesIO, None] = None,
+         output_format: Optional[str] = None,
          include_speaker_in_text: bool = True,
-     ) -> Pathlike:
+         word_level: bool = False,
+         karaoke_config: Optional["KaraokeConfig"] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> Union[Pathlike, bytes]:
          """
-         Write caption to file.
+         Write caption to file or return as bytes.

          Args:
-             path: Path to output caption file
+             path: Path to output caption file, BytesIO object, or None to return bytes
+             output_format: Output format (e.g., 'srt', 'vtt', 'ass')
              include_speaker_in_text: Whether to include speaker labels in text
+             word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+             metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+                 Can be used to override or supplement format-specific metadata.

          Returns:
-             Path to the written file
-
-         Example:
-             >>> caption = Caption.read("input.srt")
-             >>> caption.write("output.vtt", include_speaker_in_text=False)
+             Path to the written file if path is a file path, or bytes if path is BytesIO/None
          """
          if self.alignments:
-             alignments = self.alignments
+             supervisions = self.alignments
+         elif self.supervisions:
+             supervisions = self.supervisions
          else:
-             alignments = self.supervisions
-
-         if not alignments:
-             alignments = self.transcription
+             supervisions = self.transcription
+
+         # Merge external metadata with self.metadata (external takes precedence)
+         effective_metadata = dict(self.metadata) if self.metadata else {}
+         if metadata:
+             effective_metadata.update(metadata)
+
+         # Determine output format
+         if output_format:
+             output_format = output_format.lower()
+         elif isinstance(path, (io.BytesIO, type(None))):
+             output_format = self.source_format or "srt"
+         else:
+             output_format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"
+
+         # Special casing for professional formats as before
+         ext = output_format
+         if isinstance(path, (str, Path)):
+             path_str = str(path)
+             if path_str.endswith("_avid.txt"):
+                 ext = "avid_ds"
+             elif "audition" in path_str.lower() and path_str.endswith(".csv"):
+                 ext = "audition_csv"
+             elif "edimarker" in path_str.lower() and path_str.endswith(".csv"):
+                 ext = "edimarker_csv"
+             elif "imsc" in path_str.lower() and path_str.endswith(".ttml"):
+                 ext = "imsc1"
+             elif "ebu" in path_str.lower() and path_str.endswith(".ttml"):
+                 ext = "ebu_tt_d"
+
+         writer_cls = get_writer(ext)
+         if not writer_cls:
+             from .formats.pysubs2 import Pysubs2Format
+
+             writer_cls = Pysubs2Format
+
+         if isinstance(path, (str, Path)):
+             return writer_cls.write(
+                 supervisions,
+                 path,
+                 include_speaker=include_speaker_in_text,
+                 word_level=word_level,
+                 karaoke_config=karaoke_config,
+                 metadata=effective_metadata,
+             )

-         return self._write_caption(alignments, path, include_speaker_in_text)
+         content = writer_cls.to_bytes(
+             supervisions,
+             include_speaker=include_speaker_in_text,
+             word_level=word_level,
+             karaoke_config=karaoke_config,
+             metadata=effective_metadata,
+         )
+         if isinstance(path, io.BytesIO):
+             path.write(content)
+             path.seek(0)
+         return content

      def read_speaker_diarization(
          self,
          path: Pathlike,
-     ) -> TextGrid:
+     ) -> "DiarizationOutput":
          """
          Read speaker diarization TextGrid from file.
          """
@@ -404,1057 +649,6 @@ class Caption:
          self.speaker_diarization.write(path)
          return path

-     @staticmethod
-     def _parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
-         """
-         Extract word-level alignment items from Supervision object.
-
-         Args:
-             supervision: Supervision object with potential alignment data
-
-         Returns:
-             List of AlignmentItem objects, or None if no alignment data present
-         """
-         if not hasattr(supervision, "alignment") or not supervision.alignment:
-             return None
-
-         if "word" not in supervision.alignment:
-             return None
-
-         return supervision.alignment["word"]
-
-     @classmethod
-     def _write_caption(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> Pathlike:
-         """
-         Write caption to file in various formats.
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output file
-             include_speaker_in_text: Whether to include speaker in text
-
-         Returns:
-             Path to written file
-         """
-         if str(output_path)[-4:].lower() == ".txt":
-             with open(output_path, "w", encoding="utf-8") as f:
-                 for sup in alignments:
-                     word_items = cls._parse_alignment_from_supervision(sup)
-                     if word_items:
-                         for item in word_items:
-                             f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
-                     else:
-                         if include_speaker_in_text and sup.speaker is not None:
-                             # Use [SPEAKER]: format for consistency with parsing
-                             if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                                 text = f"[{sup.speaker}]: {sup.text}"
-                             else:
-                                 text = f"{sup.text}"
-                         else:
-                             text = sup.text
-                         f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
-
-         elif str(output_path)[-5:].lower() == ".json":
-             with open(output_path, "w", encoding="utf-8") as f:
-                 # Enhanced JSON export with word-level alignment
-                 json_data = []
-                 for sup in alignments:
-                     sup_dict = sup.to_dict()
-                     json_data.append(sup_dict)
-                 json.dump(json_data, f, ensure_ascii=False, indent=4)
-         elif str(output_path).lower().endswith(".textgrid"):
-             from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-             tg = TextGrid()
-             supervisions, words, scores = [], [], {"utterances": [], "words": []}
-             for supervision in sorted(alignments, key=lambda x: x.start):
-                 # Respect `original_speaker` custom flag: default to include speaker when missing
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker is not None
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"{supervision.speaker} {supervision.text}"
-                 else:
-                     text = supervision.text
-                 supervisions.append(Interval(supervision.start, supervision.end, text or ""))
-                 # Extract word-level alignment using helper function
-                 word_items = cls._parse_alignment_from_supervision(supervision)
-                 if word_items:
-                     for item in word_items:
-                         words.append(Interval(item.start, item.end, item.symbol))
-                         if item.score is not None:
-                             scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
-                 if supervision.has_custom("score"):
-                     scores["utterances"].append(
-                         Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
-                     )
-
-             tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
-             if words:
-                 tg.add_tier(IntervalTier(name="words", objects=words))
-
-             if scores["utterances"]:
-                 tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
-             if scores["words"]:
-                 tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
-
-             write_to_file(tg, output_path, format="long")
-         elif str(output_path)[-4:].lower() == ".tsv":
-             cls._write_tsv(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".csv":
-             cls._write_csv(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".aud":
-             cls._write_aud(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".sbv":
-             cls._write_sbv(alignments, output_path, include_speaker_in_text)
-         else:
-             import pysubs2
-
-             subs = pysubs2.SSAFile()
-             for sup in alignments:
-                 # Add word-level timing as metadata in the caption text
-                 word_items = cls._parse_alignment_from_supervision(sup)
-                 if word_items:
-                     for word in word_items:
-                         subs.append(
-                             pysubs2.SSAEvent(
-                                 start=int(word.start * 1000),
-                                 end=int(word.end * 1000),
-                                 text=word.symbol,
-                                 name=sup.speaker or "",
-                             )
-                         )
-                 else:
-                     if include_speaker_in_text and sup.speaker is not None:
-                         if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                             text = f"{sup.speaker} {sup.text}"
-                         else:
-                             text = f"{sup.text}"
-                     else:
-                         text = sup.text
-                     subs.append(
-                         pysubs2.SSAEvent(
-                             start=int(sup.start * 1000),
-                             end=int(sup.end * 1000),
-                             text=text or "",
-                             name=sup.speaker or "",
-                         )
-                     )
-
-             # MicroDVD format requires framerate to be specified
-             output_ext = str(output_path).lower().split(".")[-1]
-             if output_ext == "sub":
-                 # Default to 25 fps for MicroDVD format if not specified
-                 subs.save(output_path, fps=25.0)
-             else:
-                 subs.save(output_path)
-
-         return output_path
-
-     @classmethod
-     def _extract_metadata(cls, caption: Pathlike, format: Optional[str]) -> Dict[str, str]:
-         """
-         Extract metadata from caption file header.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-
-         Returns:
-             Dictionary of metadata key-value pairs
-         """
-         metadata = {}
-         caption_path = Path(str(caption))
-
-         if not caption_path.exists():
-             return metadata
-
-         try:
-             with open(caption_path, "r", encoding="utf-8") as f:
-                 content = f.read(2048)  # Read first 2KB for metadata
-
-             # WebVTT metadata extraction
-             if format == "vtt" or content.startswith("WEBVTT"):
-                 lines = content.split("\n")
-                 for line in lines[:10]:  # Check first 10 lines
-                     line = line.strip()
-                     if line.startswith("Kind:"):
-                         metadata["kind"] = line.split(":", 1)[1].strip()
-                     elif line.startswith("Language:"):
-                         metadata["language"] = line.split(":", 1)[1].strip()
-                     elif line.startswith("NOTE"):
-                         # Extract metadata from NOTE comments
-                         match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
-                         if match:
-                             key, value = match.groups()
-                             metadata[key.lower()] = value.strip()
-
-             # SRT doesn't have standard metadata, but check for BOM
-             elif format == "srt":
-                 if content.startswith("\ufeff"):
-                     metadata["encoding"] = "utf-8-sig"
-
-             # TextGrid metadata
-             elif format == "textgrid" or caption_path.suffix.lower() == ".textgrid":
-                 match = re.search(r"xmin\s*=\s*([\d.]+)", content)
-                 if match:
-                     metadata["xmin"] = match.group(1)
-                 match = re.search(r"xmax\s*=\s*([\d.]+)", content)
-                 if match:
-                     metadata["xmax"] = match.group(1)
-
-         except Exception:
-             # If metadata extraction fails, continue with empty metadata
-             pass
-
-         return metadata
-
-     @classmethod
-     def _parse_youtube_vtt_with_word_timestamps(
-         cls, content: str, normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse YouTube VTT format with word-level timestamps.
-
-         YouTube auto-generated captions use this format:
-             Word1<00:00:10.559><c> Word2</c><00:00:11.120><c> Word3</c>...
-
-         Args:
-             content: VTT file content
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects with word-level alignments
-         """
-         from lhotse.supervision import AlignmentItem
-
-         supervisions = []
-
-         # Pattern to match timestamp lines: 00:00:14.280 --> 00:00:17.269 align:start position:0%
-         timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")
-
-         # Pattern to match word-level timestamps: <00:00:10.559><c> word</c>
-         word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")
-
-         # Pattern to match the first word (before first timestamp)
-         first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")
-
-         def parse_timestamp(ts: str) -> float:
-             """Convert timestamp string to seconds."""
-             ts = ts.replace(",", ".")
-             parts = ts.split(":")
-             hours = int(parts[0])
-             minutes = int(parts[1])
-             seconds = float(parts[2])
-             return hours * 3600 + minutes * 60 + seconds
-
-         lines = content.split("\n")
-         i = 0
-         while i < len(lines):
-             line = lines[i].strip()
-
-             # Look for timestamp line
-             ts_match = timestamp_pattern.search(line)
-             if ts_match:
-                 cue_start = parse_timestamp(ts_match.group(1))
-                 cue_end = parse_timestamp(ts_match.group(2))
-
-                 # Read the next non-empty lines for cue content
-                 cue_lines = []
-                 i += 1
-                 while i < len(lines) and lines[i].strip() and not timestamp_pattern.search(lines[i]):
-                     cue_lines.append(lines[i])
-                     i += 1
-
-                 # Process cue content
-                 for cue_line in cue_lines:
-                     cue_line = cue_line.strip()
-                     if not cue_line:
-                         continue
-
-                     # Check if this line has word-level timestamps
-                     word_matches = word_timestamp_pattern.findall(cue_line)
-                     if word_matches:
-                         # This line has word-level timing
-                         word_alignments = []
-
-                         # Get the first word (before the first timestamp)
-                         first_match = first_word_pattern.match(cue_line)
-                         if first_match:
-                             first_word = first_match.group(1).strip()
-                             first_word_next_ts = parse_timestamp(first_match.group(2))
-                             if first_word:
-                                 # First word starts at cue_start
-                                 word_alignments.append(
-                                     AlignmentItem(
-                                         symbol=first_word,
-                                         start=cue_start,
-                                         duration=first_word_next_ts - cue_start,
-                                     )
-                                 )
-
-                         # Process remaining words with timestamps
-                         for idx, (ts, word) in enumerate(word_matches):
-                             word_start = parse_timestamp(ts)
-                             word = word.strip()
-                             if not word:
-                                 continue
-
-                             # Calculate duration based on next word's timestamp or cue end
-                             if idx + 1 < len(word_matches):
-                                 next_ts = parse_timestamp(word_matches[idx + 1][0])
-                                 duration = next_ts - word_start
-                             else:
-                                 duration = cue_end - word_start
-
-                             word_alignments.append(
-                                 AlignmentItem(
-                                     symbol=word,
-                                     start=word_start,
-                                     duration=max(0.01, duration),  # Ensure positive duration
-                                 )
-                             )
-
-                         if word_alignments:
-                             # Create supervision with word-level alignment
-                             full_text = " ".join(item.symbol for item in word_alignments)
-                             if normalize_text:
-                                 full_text = normalize_text_fn(full_text)
-
-                             sup_start = word_alignments[0].start
-                             sup_end = word_alignments[-1].start + word_alignments[-1].duration
-
-                             supervisions.append(
-                                 Supervision(
-                                     text=full_text,
-                                     start=sup_start,
-                                     duration=sup_end - sup_start,
-                                     alignment={"word": word_alignments},
-                                 )
-                             )
-                     else:
-                         # Plain text line without word-level timing - skip duplicate lines
-                         # (YouTube VTT often repeats the previous line without timestamps)
-                         pass
-
-                 continue
-             i += 1
-
-         # Merge consecutive supervisions to form complete utterances
-         if supervisions:
-             supervisions = cls._merge_youtube_vtt_supervisions(supervisions)
-
-         return supervisions
-
-     @classmethod
-     def _merge_youtube_vtt_supervisions(cls, supervisions: List[Supervision]) -> List[Supervision]:
-         """
-         Merge consecutive YouTube VTT supervisions into complete utterances.
-
-         YouTube VTT splits utterances across multiple cues. This method merges
-         cues that are close together in time.
-
-         Args:
-             supervisions: List of supervisions to merge
-
-         Returns:
-             List of merged supervisions
-         """
-         if not supervisions:
-             return supervisions
-
-         merged = []
-         current = supervisions[0]
-
-         for next_sup in supervisions[1:]:
-             # Check if next supervision is close enough to merge (within 0.5 seconds)
-             gap = next_sup.start - (current.start + current.duration)
-
-             if gap < 0.5 and current.alignment and next_sup.alignment:
-                 # Merge alignments
-                 current_words = current.alignment.get("word", [])
-                 next_words = next_sup.alignment.get("word", [])
-                 merged_words = list(current_words) + list(next_words)
-
-                 # Create merged supervision
-                 merged_text = current.text + " " + next_sup.text
-                 merged_end = next_sup.start + next_sup.duration
-
-                 current = Supervision(
-                     text=merged_text,
-                     start=current.start,
-                     duration=merged_end - current.start,
-                     alignment={"word": merged_words},
-                 )
-             else:
-                 merged.append(current)
-                 current = next_sup
-
-         merged.append(current)
-         return merged
-
-     @classmethod
-     def _is_youtube_vtt_with_word_timestamps(cls, content: str) -> bool:
-         """
-         Check if content is YouTube VTT format with word-level timestamps.
-
-         Args:
-             content: File content to check
-
-         Returns:
-             True if content contains YouTube-style word timestamps
-         """
-         # Look for pattern like <00:00:10.559><c> word</c>
-         return bool(re.search(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>", content))
-
-     @classmethod
-     def _parse_supervisions(
-         cls, caption: Pathlike, format: Optional[str], normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse supervisions from caption file.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         if format:
-             format = format.lower()
-
-         # Check for YouTube VTT with word-level timestamps first
-         caption_path = Path(str(caption))
-         if caption_path.exists():
-             with open(caption_path, "r", encoding="utf-8") as f:
-                 content = f.read()
-                 if cls._is_youtube_vtt_with_word_timestamps(content):
-                     return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
-
-         # Match Gemini format: explicit format, or files ending with Gemini.md/Gemini3.md,
-         # or files containing "gemini" in the name with .md extension
-         caption_str = str(caption).lower()
-         is_gemini_format = (
-             format == "gemini"
-             or str(caption).endswith("Gemini.md")
-             or str(caption).endswith("Gemini3.md")
-             or ("gemini" in caption_str and caption_str.endswith(".md"))
-         )
-         if is_gemini_format:
-             from .gemini_reader import GeminiReader
-
-             supervisions = GeminiReader.extract_for_alignment(caption)
-         elif format and (format == "textgrid" or str(caption).lower().endswith("textgrid")):
-             # Internel usage
-             from tgt import read_textgrid
-
-             tgt = read_textgrid(caption)
-             supervisions = []
-             for tier in tgt.tiers:
-                 supervisions.extend(
-                     [
-                         Supervision(
-                             text=interval.text,
-                             start=interval.start_time,
-                             duration=interval.end_time - interval.start_time,
-                             speaker=tier.name,
-                         )
-                         for interval in tier.intervals
-                     ]
-                 )
-             supervisions = sorted(supervisions, key=lambda x: x.start)
-         elif format == "tsv" or str(caption)[-4:].lower() == ".tsv":
-             supervisions = cls._parse_tsv(caption, normalize_text)
-         elif format == "csv" or str(caption)[-4:].lower() == ".csv":
-             supervisions = cls._parse_csv(caption, normalize_text)
-         elif format == "aud" or str(caption)[-4:].lower() == ".aud":
-             supervisions = cls._parse_aud(caption, normalize_text)
-         elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
-             supervisions = cls._parse_sbv(caption, normalize_text)
-         elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
-             if not Path(str(caption)).exists():  # str
-                 lines = [line.strip() for line in str(caption).split("\n")]
-             else:  # file
-                 path_str = str(caption)
-                 with open(path_str, encoding="utf-8") as f:
-                     lines = [line.strip() for line in f.readlines()]
-             if normalize_text:
-                 lines = [normalize_text_fn(line) for line in lines]
-             supervisions = []
-             for line in lines:
-                 if line:
-                     # First try to parse timestamp format: [start-end] text
-                     start, end, remaining_text = parse_timestamp_text(line)
-                     if start is not None and end is not None:
-                         # Has timestamp, now check for speaker in the remaining text
-                         speaker, text = parse_speaker_text(remaining_text)
-                         supervisions.append(
-                             Supervision(
-                                 text=text,
-                                 start=start,
-                                 duration=end - start,
-                                 speaker=speaker,
-                             )
-                         )
-                     else:
-                         # No timestamp, just parse speaker and text
-                         speaker, text = parse_speaker_text(line)
-                         supervisions.append(Supervision(text=text, speaker=speaker))
-         else:
-             try:
-                 supervisions = cls._parse_caption(caption, format=format, normalize_text=normalize_text)
-             except Exception as e:
-                 print(f"Failed to parse caption with Format: {format}, Exception: {e}, trying 'gemini' parser.")
-                 from .gemini_reader import GeminiReader
-
-                 supervisions = GeminiReader.extract_for_alignment(caption)
-
-         return supervisions
-
-     @classmethod
-     def _parse_tsv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse TSV (Tab-Separated Values) format caption file.
-
-         Format specifications:
-         - With speaker: speaker\tstart\tend\ttext
-         - Without speaker: start\tend\ttext
-         - Times are in milliseconds
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         # Check if first line is a header
-         first_line = lines[0].strip().lower()
-         has_header = "start" in first_line and "end" in first_line and "text" in first_line
-         has_speaker_column = "speaker" in first_line
-
-         start_idx = 1 if has_header else 0
-
-         for line in lines[start_idx:]:
-             line = line.strip()
-             if not line:
-                 continue
-
-             parts = line.split("\t")
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 if has_speaker_column and len(parts) >= 4:
-                     # Format: speaker\tstart\tend\ttext
-                     speaker = parts[0].strip() if parts[0].strip() else None
-                     start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[2]) / 1000.0
-                     text = "\t".join(parts[3:]).strip()
-                 else:
-                     # Format: start\tend\ttext
-                     start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[1]) / 1000.0
-                     text = "\t".join(parts[2:]).strip()
-                     speaker = None
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_csv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse CSV (Comma-Separated Values) format caption file.
-
-         Format specifications:
-         - With speaker: speaker,start,end,text
-         - Without speaker: start,end,text
-         - Times are in milliseconds
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         import csv
-
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8", newline="") as f:
-             reader = csv.reader(f)
-             lines = list(reader)
-
-         if not lines:
-             return supervisions
-
-         # Check if first line is a header
-         first_line = [col.strip().lower() for col in lines[0]]
-         has_header = "start" in first_line and "end" in first_line and "text" in first_line
-         has_speaker_column = "speaker" in first_line
-
-         start_idx = 1 if has_header else 0
-
-         for parts in lines[start_idx:]:
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 if has_speaker_column and len(parts) >= 4:
-                     # Format: speaker,start,end,text
-                     speaker = parts[0].strip() if parts[0].strip() else None
-                     start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[2]) / 1000.0
-                     text = ",".join(parts[3:]).strip()
-                 else:
-                     # Format: start,end,text
-                     start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[1]) / 1000.0
-                     text = ",".join(parts[2:]).strip()
-                     speaker = None
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_aud(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse AUD (Audacity Labels) format caption file.
-
-         Format: start\tend\t[[speaker]]text
-         - Times are in seconds (float)
-         - Speaker is optional and enclosed in [[brackets]]
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         for line in lines:
-             line = line.strip()
-             if not line:
-                 continue
-
-             parts = line.split("\t")
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 # AUD format: start\tend\ttext (speaker in [[brackets]])
-                 start = float(parts[0])
-                 end = float(parts[1])
-                 text = "\t".join(parts[2:]).strip()
-
-                 # Extract speaker from [[speaker]] prefix
-                 speaker = None
-                 speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
-                 if speaker_match:
-                     speaker = speaker_match.group(1)
-                     text = speaker_match.group(2)
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse SubViewer (SBV) format caption file.
-
-         Format:
-             0:00:00.000,0:00:02.000
-             Text line 1
-
-             0:00:02.000,0:00:04.000
-             Text line 2
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             content = f.read()
-
-         # Split by double newlines to separate entries
-         entries = content.strip().split("\n\n")
-
-         for entry in entries:
-             lines = entry.strip().split("\n")
-             if len(lines) < 2:
-                 continue
-
-             # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
-             timestamp_line = lines[0].strip()
-             # Remaining lines: text
-             text_lines = lines[1:]
-
-             try:
-                 # Parse timestamp: 0:00:00.000,0:00:02.000
-                 if "," not in timestamp_line:
-                     continue
-
-                 start_str, end_str = timestamp_line.split(",", 1)
-
-                 # Parse start time
-                 start_parts = start_str.strip().split(":")
-                 if len(start_parts) == 3:
-                     h, m, s = start_parts
-                     s_parts = s.split(".")
-                     start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                     if len(s_parts) > 1:
-                         start += int(s_parts[1]) / 1000.0
-                 else:
-                     continue
-
-                 # Parse end time
-                 end_parts = end_str.strip().split(":")
-                 if len(end_parts) == 3:
-                     h, m, s = end_parts
-                     s_parts = s.split(".")
-                     end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                     if len(s_parts) > 1:
-                         end += int(s_parts[1]) / 1000.0
-                 else:
-                     continue
-
-                 # Parse text and speaker
-                 text = " ".join(text_lines).strip()
-                 speaker, text = parse_speaker_text(text)
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed entries
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _write_tsv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to TSV format.
-
-         Format: speaker\tstart\tend\ttext (with speaker)
-         or: start\tend\ttext (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output TSV file
-             include_speaker_in_text: Whether to include speaker column
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             # Write header
-             if include_speaker_in_text:
-                 file.write("speaker\tstart\tend\ttext\n")
-                 for supervision in alignments:
-                     # Respect `original_speaker` custom flag: default to True when missing
-                     include_speaker = supervision.speaker and (
-                         not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                     )
-                     speaker = supervision.speaker if include_speaker else ""
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip().replace("\t", " ")
-                     file.write(f"{speaker}\t{start_ms}\t{end_ms}\t{text}\n")
-             else:
-                 file.write("start\tend\ttext\n")
-                 for supervision in alignments:
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip().replace("\t", " ")
-                     file.write(f"{start_ms}\t{end_ms}\t{text}\n")
-
-     @classmethod
-     def _write_csv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to CSV format.
-
-         Format: speaker,start,end,text (with speaker)
-         or: start,end,text (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output CSV file
-             include_speaker_in_text: Whether to include speaker column
-         """
-         import csv
-
-         with open(output_path, "w", encoding="utf-8", newline="") as file:
-             if include_speaker_in_text:
-                 writer = csv.writer(file)
-                 writer.writerow(["speaker", "start", "end", "text"])
-                 for supervision in alignments:
-                     include_speaker = supervision.speaker and (
-                         not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                     )
-                     speaker = supervision.speaker if include_speaker else ""
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip()
-                     writer.writerow([speaker, start_ms, end_ms, text])
-             else:
-                 writer = csv.writer(file)
-                 writer.writerow(["start", "end", "text"])
-                 for supervision in alignments:
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip()
-                     writer.writerow([start_ms, end_ms, text])
-
-     @classmethod
-     def _write_aud(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to AUD format.
-
-         Format: start\tend\t[[speaker]]text
-         or: start\tend\ttext (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output AUD file
-             include_speaker_in_text: Whether to include speaker in [[brackets]]
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             for supervision in alignments:
-                 start = supervision.start
-                 end = supervision.end
-                 text = supervision.text.strip().replace("\t", " ")
-
-                 # Respect `original_speaker` custom flag when adding speaker prefix
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"[[{supervision.speaker}]]{text}"
-
-                 file.write(f"{start}\t{end}\t{text}\n")
-
-     @classmethod
-     def _write_sbv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to SubViewer (SBV) format.
-
-         Format:
-             0:00:00.000,0:00:02.000
-             Text line 1
-
-             0:00:02.000,0:00:04.000
-             Text line 2
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output SBV file
-             include_speaker_in_text: Whether to include speaker in text
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             for i, supervision in enumerate(alignments):
-                 # Format timestamps as H:MM:SS.mmm
-                 start_h = int(supervision.start // 3600)
-                 start_m = int((supervision.start % 3600) // 60)
-                 start_s = int(supervision.start % 60)
-                 start_ms = int((supervision.start % 1) * 1000)
-
-                 end_h = int(supervision.end // 3600)
-                 end_m = int((supervision.end % 3600) // 60)
-                 end_s = int(supervision.end % 60)
-                 end_ms = int((supervision.end % 1) * 1000)
-
-                 start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
-                 end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
-
-                 # Write timestamp line
-                 file.write(f"{start_time},{end_time}\n")
-
-                 # Write text (with optional speaker). Respect `original_speaker` custom flag.
-                 text = supervision.text.strip()
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"{supervision.speaker}: {text}"
-
-                 file.write(f"{text}\n")
-
-                 # Add blank line between entries (except after last one)
-                 if i < len(alignments) - 1:
-                     file.write("\n")
-
-     @classmethod
-     def _parse_caption(
-         cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse caption using pysubs2.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         import pysubs2
-
-         try:
-             subs: pysubs2.SSAFile = pysubs2.load(
-                 caption, encoding="utf-8", format_=format if format != "auto" else None
-             )  # file
-         except IOError:
-             try:
-                 subs: pysubs2.SSAFile = pysubs2.SSAFile.from_string(
-                     caption, format_=format if format != "auto" else None
-                 )  # str
-             except Exception as e:
-                 del e
-                 subs: pysubs2.SSAFile = pysubs2.load(caption, encoding="utf-8")  # auto detect format
-
-         # Parse supervisions
-         supervisions = []
-         for event in subs.events:
-             if normalize_text:
-                 event.text = normalize_text_fn(event.text)
-             speaker, text = parse_speaker_text(event.text)
-             supervisions.append(
-                 Supervision(
-                     text=text,
-                     speaker=speaker or event.name,
-                     start=event.start / 1000.0 if event.start is not None else None,
-                     duration=(event.end - event.start) / 1000.0 if event.end is not None else None,
-                 )
-             )
-         return supervisions
-
      def __repr__(self) -> str:
          """String representation of Caption."""
          lang = f"lang={self.language}" if self.language else "lang=unknown"