lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
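
What follows is the diff of lattifai/caption/caption.py (item 11 above). The main change: the hand-rolled per-format parse/write methods are removed in favor of a format registry (detect_format, get_reader, get_writer from the new lattifai/caption/formats package), and Caption gains in-memory I/O. A minimal sketch of the new surface, assembled from the docstrings in this diff — the import path and the SRT snippet are illustrative assumptions, not taken from the package:

    import io

    # Direct module path per this diff; the package may also re-export Caption.
    from lattifai.caption.caption import Caption

    # Parse caption content without touching the filesystem.
    srt = "1\n00:00:00,000 --> 00:00:02,000\nHello world\n"
    caption = Caption.from_string(srt, format="srt")  # wraps io.StringIO and calls Caption.read

    # Serialize to another format entirely in memory.
    vtt_bytes = caption.to_bytes(output_format="vtt")  # delegates to write(None, ...)

    # Or pass write() a buffer: it fills the buffer and returns the bytes.
    buf = io.BytesIO()
    srt_bytes = caption.write(buf, output_format="srt")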
@@ -1,19 +1,22 @@
 """Caption data structure for storing subtitle information with metadata."""
 
-import json
-import re
+from __future__ import annotations
+
+import io
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, List, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
+
+if TYPE_CHECKING:
+    from ..config.caption import KaraokeConfig
 
 from lhotse.supervision import AlignmentItem
 from lhotse.utils import Pathlike
 from tgt import TextGrid
 
 from ..config.caption import InputCaptionFormat, OutputCaptionFormat  # noqa: F401
+from .formats import detect_format, get_reader, get_writer
 from .supervision import Supervision
-from .text_parser import normalize_text as normalize_text_fn
-from .text_parser import parse_speaker_text, parse_timestamp_text
 
 DiarizationOutput = TypeVar("DiarizationOutput")
 
@@ -50,7 +53,7 @@ class Caption:
     kind: Optional[str] = None
     source_format: Optional[str] = None
     source_path: Optional[Pathlike] = None
-    metadata: Dict[str, str] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
 
     def __len__(self) -> int:
         """Return the number of supervision segments."""
@@ -66,7 +69,7 @@ class Caption:
 
     def __bool__(self) -> bool:
         """Return True if caption has supervisions."""
-        return self.__len__() > 0
+        return len(self) > 0
 
     @property
     def is_empty(self) -> bool:
@@ -147,19 +150,72 @@ class Caption:
         Returns:
             New Caption instance with shifted timestamps
         """
-        shifted_sups = [
-            Supervision(
-                text=sup.text,
-                start=sup.start + seconds,
-                duration=sup.duration,
-                speaker=sup.speaker,
-                id=sup.id,
-                language=sup.language,
-                alignment=sup.alignment if hasattr(sup, "alignment") else None,
-                custom=sup.custom,
+        shifted_sups = []
+        for sup in self.supervisions:
+            # Calculate physical time range
+            raw_start = sup.start + seconds
+            raw_end = sup.end + seconds
+
+            # Skip segments that end before 0
+            if raw_end <= 0:
+                continue
+
+            # Clip start to 0 if negative
+            if raw_start < 0:
+                final_start = 0.0
+                final_duration = raw_end
+            else:
+                final_start = raw_start
+                final_duration = sup.duration
+
+            # Handle alignment (word-level timestamps)
+            final_alignment = None
+            original_alignment = getattr(sup, "alignment", None)
+            if original_alignment and "word" in original_alignment:
+                new_words = []
+                for word in original_alignment["word"]:
+                    w_start = word.start + seconds
+                    w_end = w_start + word.duration
+
+                    # Skip words that end before 0
+                    if w_end <= 0:
+                        continue
+
+                    # Clip start to 0 if negative
+                    if w_start < 0:
+                        w_final_start = 0.0
+                        w_final_duration = w_end
+                    else:
+                        w_final_start = w_start
+                        w_final_duration = word.duration
+
+                    new_words.append(
+                        AlignmentItem(
+                            symbol=word.symbol,
+                            start=w_final_start,
+                            duration=w_final_duration,
+                            score=word.score,
+                        )
+                    )
+
+                # Copy original alignment dict structure and update words
+                final_alignment = original_alignment.copy()
+                final_alignment["word"] = new_words
+
+            shifted_sups.append(
+                Supervision(
+                    text=sup.text,
+                    start=final_start,
+                    duration=final_duration,
+                    speaker=sup.speaker,
+                    id=sup.id,
+                    recording_id=sup.recording_id if hasattr(sup, "recording_id") else "",
+                    channel=getattr(sup, "channel", 0),
+                    language=sup.language,
+                    alignment=final_alignment,
+                    custom=sup.custom,
+                )
             )
-            for sup in self.supervisions
-        ]
 
         return Caption(
             supervisions=shifted_sups,
@@ -170,52 +226,90 @@ class Caption:
             metadata=self.metadata.copy(),
         )
 
-    def to_string(self, format: str = "srt") -> str:
+    def with_margins(
+        self,
+        start_margin: float = 0.08,
+        end_margin: float = 0.20,
+        min_gap: float = 0.08,
+        collision_mode: str = "trim",
+    ) -> "Caption":
         """
-        Return caption content in specified format.
+        Create a new Caption with segment boundaries adjusted based on word-level alignment.
+
+        Uses supervision.alignment['word'] to recalculate segment start/end times
+        with the specified margins applied around the actual speech boundaries.
 
         Args:
-            format: Output format (e.g., 'srt', 'vtt', 'ass')
+            start_margin: Seconds to extend before the first word (default: 0.08)
+            end_margin: Seconds to extend after the last word (default: 0.20)
+            min_gap: Minimum gap between segments for collision handling (default: 0.08)
+            collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')
 
         Returns:
-            String containing formatted captions
-        """
-        import pysubs2
+            New Caption instance with adjusted timestamps
+
+        Note:
+            Segments without alignment data will keep their original timestamps.
 
-        subs = pysubs2.SSAFile()
+        Example:
+            >>> caption = Caption.read("aligned.srt")
+            >>> adjusted = caption.with_margins(start_margin=0.05, end_margin=0.15)
+            >>> adjusted.write("output.srt")
+        """
+        from .standardize import apply_margins_to_captions
 
+        # Determine which supervisions to use
         if self.alignments:
-            alignments = self.alignments
+            source_sups = self.alignments
+        elif self.supervisions:
+            source_sups = self.supervisions
         else:
-            alignments = self.supervisions
-
-        if not alignments:
-            alignments = self.transcription
-
-        for sup in alignments:
-            # Add word-level timing as metadata in the caption text
-            word_items = self._parse_alignment_from_supervision(sup)
-            if word_items:
-                for word in word_items:
-                    subs.append(
-                        pysubs2.SSAEvent(
-                            start=int(word.start * 1000),
-                            end=int(word.end * 1000),
-                            text=word.symbol,
-                            name=sup.speaker or "",
-                        )
-                    )
-            else:
-                subs.append(
-                    pysubs2.SSAEvent(
-                        start=int(sup.start * 1000),
-                        end=int(sup.end * 1000),
-                        text=sup.text or "",
-                        name=sup.speaker or "",
-                    )
-                )
+            source_sups = self.transcription
+
+        adjusted_sups = apply_margins_to_captions(
+            source_sups,
+            start_margin=start_margin,
+            end_margin=end_margin,
+            min_gap=min_gap,
+            collision_mode=collision_mode,
+        )
+
+        return Caption(
+            supervisions=adjusted_sups,
+            transcription=self.transcription,
+            audio_events=self.audio_events,
+            speaker_diarization=self.speaker_diarization,
+            alignments=[],  # Clear alignments since we've applied them
+            language=self.language,
+            kind=self.kind,
+            source_format=self.source_format,
+            source_path=self.source_path,
+            metadata=self.metadata.copy(),
+        )
+
+    def to_string(
+        self,
+        format: str = "srt",
+        word_level: bool = False,
+        karaoke_config: Optional["KaraokeConfig"] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        """
+        Return caption content in specified format.
 
-        return subs.to_string(format_=format)
+        Args:
+            format: Output format (e.g., 'srt', 'vtt', 'ass')
+            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+            karaoke_config: Karaoke configuration. When provided with enabled=True,
+                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+        Returns:
+            String containing formatted captions
+        """
+        return self.to_bytes(
+            output_format=format, word_level=word_level, karaoke_config=karaoke_config, metadata=metadata
+        ).decode("utf-8")
 
     def to_dict(self) -> Dict:
         """
@@ -269,6 +363,71 @@ class Caption:
             metadata=metadata or {},
         )
 
+    @classmethod
+    def from_string(
+        cls,
+        content: str,
+        format: str,
+        normalize_text: bool = True,
+    ) -> "Caption":
+        """
+        Create Caption from string content.
+
+        Args:
+            content: Caption content as string
+            format: Caption format (e.g., 'srt', 'vtt', 'ass')
+            normalize_text: Whether to normalize text during reading
+
+        Returns:
+            New Caption instance
+
+        Example:
+            >>> srt_content = \"\"\"1
+            ... 00:00:00,000 --> 00:00:02,000
+            ... Hello world\"\"\"
+            >>> caption = Caption.from_string(srt_content, format=\"srt\")
+        """
+        buffer = io.StringIO(content)
+        return cls.read(buffer, format=format, normalize_text=normalize_text)
+
+    def to_bytes(
+        self,
+        output_format: Optional[str] = None,
+        include_speaker_in_text: bool = True,
+        word_level: bool = False,
+        karaoke_config: Optional["KaraokeConfig"] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> bytes:
+        """
+        Convert caption to bytes.
+
+        Args:
+            output_format: Output format (e.g., 'srt', 'vtt', 'ass'). Defaults to source_format or 'srt'
+            include_speaker_in_text: Whether to include speaker labels in text
+            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+            karaoke_config: Karaoke configuration. When provided with enabled=True,
+                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+        Returns:
+            Caption content as bytes
+
+        Example:
+            >>> caption = Caption.read("input.srt")
+            >>> # Get as bytes in original format
+            >>> data = caption.to_bytes()
+            >>> # Get as bytes in specific format
+            >>> vtt_data = caption.to_bytes(output_format="vtt")
+        """
+        return self.write(
+            None,
+            output_format=output_format,
+            include_speaker_in_text=include_speaker_in_text,
+            word_level=word_level,
+            karaoke_config=karaoke_config,
+            metadata=metadata,
+        )
+
     @classmethod
     def from_transcription_results(
         cls,
@@ -307,82 +466,168 @@ class Caption:
     @classmethod
     def read(
         cls,
-        path: Pathlike,
+        path: Union[Pathlike, io.BytesIO, io.StringIO],
         format: Optional[str] = None,
         normalize_text: bool = True,
     ) -> "Caption":
         """
-        Read caption file and return Caption object.
+        Read caption file or in-memory data and return Caption object.
 
         Args:
-            path: Path to caption file
-            format: Caption format (auto-detected if not provided)
+            path: Path to caption file, or BytesIO/StringIO object with caption content
+            format: Caption format (auto-detected if not provided, required for in-memory data)
             normalize_text: Whether to normalize text during reading
 
         Returns:
             Caption object containing supervisions and metadata
-
-        Example:
-            >>> caption = Caption.read("subtitles.srt")
-            >>> print(f"Loaded {len(caption)} segments")
         """
-        caption_path = Path(str(path)) if not isinstance(path, Path) else path
-
         # Detect format if not provided
-        if not format and caption_path.exists():
-            format = caption_path.suffix.lstrip(".").lower()
-        elif format:
-            format = format.lower()
+        if not format:
+            if isinstance(path, (io.BytesIO, io.StringIO)):
+                raise ValueError("format parameter is required when reading from BytesIO/StringIO")
+            format = detect_format(str(path))
+
+        if not format:
+            # Fallback to extension
+            if not isinstance(path, (io.BytesIO, io.StringIO)):
+                format = Path(str(path)).suffix.lstrip(".").lower()
+
+        if not format:
+            format = "srt"  # Last resort default
 
-        # Extract metadata from file
-        metadata = cls._extract_metadata(path, format)
+        # Get content if it's an in-memory buffer
+        source = path
+        if isinstance(path, io.BytesIO):
+            source = path.read().decode("utf-8")
+        elif isinstance(path, io.StringIO):
+            source = path.read()
 
-        # Parse supervisions
-        supervisions = cls._parse_supervisions(path, format, normalize_text)
+        # Reset buffer position if it was a stream
+        if isinstance(path, (io.BytesIO, io.StringIO)):
+            path.seek(0)
+
+        # Get reader and perform extraction
+        reader_cls = get_reader(format)
+        if not reader_cls:
+            # Use pysubs2 as a generic fallback if no specific reader exists
+            from .formats.pysubs2 import Pysubs2Format
+
+            reader_cls = Pysubs2Format
+
+        supervisions = reader_cls.read(source, normalize_text=normalize_text)
+        metadata = reader_cls.extract_metadata(source)
 
         # Create Caption object
+        source_path = None
+        if isinstance(path, (str, Path)) and not ("\n" in str(path) or len(str(path)) > 500):
+            try:
+                p = Path(str(path))
+                if p.exists():
+                    source_path = str(p)
+            except (OSError, ValueError):
+                pass
+
         return cls(
             supervisions=supervisions,
             language=metadata.get("language"),
             kind=metadata.get("kind"),
             source_format=format,
-            source_path=str(caption_path) if caption_path.exists() else None,
+            source_path=source_path,
             metadata=metadata,
         )
 
     def write(
         self,
-        path: Pathlike,
+        path: Union[Pathlike, io.BytesIO, None] = None,
+        output_format: Optional[str] = None,
         include_speaker_in_text: bool = True,
-    ) -> Pathlike:
+        word_level: bool = False,
+        karaoke_config: Optional["KaraokeConfig"] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> Union[Pathlike, bytes]:
         """
-        Write caption to file.
+        Write caption to file or return as bytes.
 
         Args:
-            path: Path to output caption file
+            path: Path to output caption file, BytesIO object, or None to return bytes
+            output_format: Output format (e.g., 'srt', 'vtt', 'ass')
             include_speaker_in_text: Whether to include speaker labels in text
+            word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+            karaoke_config: Karaoke configuration. When provided with enabled=True,
+                enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+            metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+                Can be used to override or supplement format-specific metadata.
 
         Returns:
-            Path to the written file
-
-        Example:
-            >>> caption = Caption.read("input.srt")
-            >>> caption.write("output.vtt", include_speaker_in_text=False)
+            Path to the written file if path is a file path, or bytes if path is BytesIO/None
         """
         if self.alignments:
-            alignments = self.alignments
+            supervisions = self.alignments
+        elif self.supervisions:
+            supervisions = self.supervisions
         else:
-            alignments = self.supervisions
-
-        if not alignments:
-            alignments = self.transcription
+            supervisions = self.transcription
+
+        # Merge external metadata with self.metadata (external takes precedence)
+        effective_metadata = dict(self.metadata) if self.metadata else {}
+        if metadata:
+            effective_metadata.update(metadata)
+
+        # Determine output format
+        if output_format:
+            output_format = output_format.lower()
+        elif isinstance(path, (io.BytesIO, type(None))):
+            output_format = self.source_format or "srt"
+        else:
+            output_format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"
+
+        # Special casing for professional formats as before
+        ext = output_format
+        if isinstance(path, (str, Path)):
+            path_str = str(path)
+            if path_str.endswith("_avid.txt"):
+                ext = "avid_ds"
+            elif "audition" in path_str.lower() and path_str.endswith(".csv"):
+                ext = "audition_csv"
+            elif "edimarker" in path_str.lower() and path_str.endswith(".csv"):
+                ext = "edimarker_csv"
+            elif "imsc" in path_str.lower() and path_str.endswith(".ttml"):
+                ext = "imsc1"
+            elif "ebu" in path_str.lower() and path_str.endswith(".ttml"):
+                ext = "ebu_tt_d"
+
+        writer_cls = get_writer(ext)
+        if not writer_cls:
+            from .formats.pysubs2 import Pysubs2Format
+
+            writer_cls = Pysubs2Format
+
+        if isinstance(path, (str, Path)):
+            return writer_cls.write(
+                supervisions,
+                path,
+                include_speaker=include_speaker_in_text,
+                word_level=word_level,
+                karaoke_config=karaoke_config,
+                metadata=effective_metadata,
+            )
 
-        return self._write_caption(alignments, path, include_speaker_in_text)
+        content = writer_cls.to_bytes(
+            supervisions,
+            include_speaker=include_speaker_in_text,
+            word_level=word_level,
+            karaoke_config=karaoke_config,
+            metadata=effective_metadata,
+        )
+        if isinstance(path, io.BytesIO):
+            path.write(content)
+            path.seek(0)
+        return content
 
     def read_speaker_diarization(
         self,
         path: Pathlike,
-    ) -> TextGrid:
+    ) -> "DiarizationOutput":
        """
         Read speaker diarization TextGrid from file.
         """
@@ -404,1059 +649,6 @@ class Caption:
         self.speaker_diarization.write(path)
         return path
 
-    @staticmethod
-    def _parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
-        """
-        Extract word-level alignment items from Supervision object.
-
-        Args:
-            supervision: Supervision object with potential alignment data
-
-        Returns:
-            List of AlignmentItem objects, or None if no alignment data present
-        """
-        if not hasattr(supervision, "alignment") or not supervision.alignment:
-            return None
-
-        if "word" not in supervision.alignment:
-            return None
-
-        return supervision.alignment["word"]
-
-    @classmethod
-    def _write_caption(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> Pathlike:
-        """
-        Write caption to file in various formats.
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output file
-            include_speaker_in_text: Whether to include speaker in text
-
-        Returns:
-            Path to written file
-        """
-        if str(output_path)[-4:].lower() == ".txt":
-            with open(output_path, "w", encoding="utf-8") as f:
-                for sup in alignments:
-                    word_items = cls._parse_alignment_from_supervision(sup)
-                    if word_items:
-                        for item in word_items:
-                            f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
-                    else:
-                        if include_speaker_in_text and sup.speaker is not None:
-                            # Use [SPEAKER]: format for consistency with parsing
-                            if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                                text = f"[{sup.speaker}]: {sup.text}"
-                            else:
-                                text = f"{sup.text}"
-                        else:
-                            text = sup.text
-                        f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
-
-        elif str(output_path)[-5:].lower() == ".json":
-            with open(output_path, "w", encoding="utf-8") as f:
-                # Enhanced JSON export with word-level alignment
-                json_data = []
-                for sup in alignments:
-                    sup_dict = sup.to_dict()
-                    json_data.append(sup_dict)
-                json.dump(json_data, f, ensure_ascii=False, indent=4)
-
-        elif str(output_path).lower().endswith(".textgrid"):
-            from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-            tg = TextGrid()
-            supervisions, words, scores = [], [], {"utterances": [], "words": []}
-            for supervision in sorted(alignments, key=lambda x: x.start):
-                # Respect `original_speaker` custom flag: default to include speaker when missing
-                if (
-                    include_speaker_in_text
-                    and supervision.speaker is not None
-                    and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                ):
-                    text = f"{supervision.speaker} {supervision.text}"
-                else:
-                    text = supervision.text
-                supervisions.append(Interval(supervision.start, supervision.end, text or ""))
-                # Extract word-level alignment using helper function
-                word_items = cls._parse_alignment_from_supervision(supervision)
-                if word_items:
-                    for item in word_items:
-                        words.append(Interval(item.start, item.end, item.symbol))
-                        if item.score is not None:
-                            scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
-                if supervision.has_custom("score"):
-                    scores["utterances"].append(
-                        Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
-                    )
-
-            tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
-            if words:
-                tg.add_tier(IntervalTier(name="words", objects=words))
-
-            if scores["utterances"]:
-                tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
-            if scores["words"]:
-                tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
-
-            write_to_file(tg, output_path, format="long")
-
-        elif str(output_path)[-4:].lower() == ".tsv":
-            cls._write_tsv(alignments, output_path, include_speaker_in_text)
-        elif str(output_path)[-4:].lower() == ".csv":
-            cls._write_csv(alignments, output_path, include_speaker_in_text)
-        elif str(output_path)[-4:].lower() == ".aud":
-            cls._write_aud(alignments, output_path, include_speaker_in_text)
-        elif str(output_path)[-4:].lower() == ".sbv":
-            cls._write_sbv(alignments, output_path, include_speaker_in_text)
-        else:
-            import pysubs2
-
-            subs = pysubs2.SSAFile()
-            for sup in alignments:
-                # Add word-level timing as metadata in the caption text
-                word_items = cls._parse_alignment_from_supervision(sup)
-                if word_items:
-                    for word in word_items:
-                        subs.append(
-                            pysubs2.SSAEvent(
-                                start=int(word.start * 1000),
-                                end=int(word.end * 1000),
-                                text=word.symbol,
-                                name=sup.speaker or "",
-                            )
-                        )
-                else:
-                    if include_speaker_in_text and sup.speaker is not None:
-                        if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                            text = f"{sup.speaker} {sup.text}"
-                        else:
-                            text = f"{sup.text}"
-                    else:
-                        text = sup.text
-                    subs.append(
-                        pysubs2.SSAEvent(
-                            start=int(sup.start * 1000),
-                            end=int(sup.end * 1000),
-                            text=text or "",
-                            name=sup.speaker or "",
-                        )
-                    )
-
-            # MicroDVD format requires framerate to be specified
-            output_ext = str(output_path).lower().split(".")[-1]
-            if output_ext == "sub":
-                # Default to 25 fps for MicroDVD format if not specified
-                subs.save(output_path, fps=25.0)
-            else:
-                subs.save(output_path)
-
-        return output_path
-
-    @classmethod
-    def _extract_metadata(cls, caption: Pathlike, format: Optional[str]) -> Dict[str, str]:
-        """
-        Extract metadata from caption file header.
-
-        Args:
-            caption: Caption file path or content
-            format: Caption format
-
-        Returns:
-            Dictionary of metadata key-value pairs
-        """
-        metadata = {}
-        caption_path = Path(str(caption))
-
-        if not caption_path.exists():
-            return metadata
-
-        try:
-            with open(caption_path, "r", encoding="utf-8") as f:
-                content = f.read(2048)  # Read first 2KB for metadata
-
-            # WebVTT metadata extraction
-            if format == "vtt" or content.startswith("WEBVTT"):
-                lines = content.split("\n")
-                for line in lines[:10]:  # Check first 10 lines
-                    line = line.strip()
-                    if line.startswith("Kind:"):
-                        metadata["kind"] = line.split(":", 1)[1].strip()
-                    elif line.startswith("Language:"):
-                        metadata["language"] = line.split(":", 1)[1].strip()
-                    elif line.startswith("NOTE"):
-                        # Extract metadata from NOTE comments
-                        match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
-                        if match:
-                            key, value = match.groups()
-                            metadata[key.lower()] = value.strip()
-
-            # SRT doesn't have standard metadata, but check for BOM
-            elif format == "srt":
-                if content.startswith("\ufeff"):
-                    metadata["encoding"] = "utf-8-sig"
-
-            # TextGrid metadata
-            elif format == "textgrid" or caption_path.suffix.lower() == ".textgrid":
-                match = re.search(r"xmin\s*=\s*([\d.]+)", content)
-                if match:
-                    metadata["xmin"] = match.group(1)
-                match = re.search(r"xmax\s*=\s*([\d.]+)", content)
-                if match:
-                    metadata["xmax"] = match.group(1)
-
-        except Exception:
-            # If metadata extraction fails, continue with empty metadata
-            pass
-
-        return metadata
-
-    @classmethod
-    def _parse_youtube_vtt_with_word_timestamps(
-        cls, content: str, normalize_text: Optional[bool] = False
-    ) -> List[Supervision]:
-        """
-        Parse YouTube VTT format with word-level timestamps.
-
-        YouTube auto-generated captions use this format:
-        Word1<00:00:10.559><c> Word2</c><00:00:11.120><c> Word3</c>...
-
-        Args:
-            content: VTT file content
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects with word-level alignments
-        """
-        from lhotse.supervision import AlignmentItem
-
-        supervisions = []
-
-        # Pattern to match timestamp lines: 00:00:14.280 --> 00:00:17.269 align:start position:0%
-        timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")
-
-        # Pattern to match word-level timestamps: <00:00:10.559><c> word</c>
-        word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")
-
-        # Pattern to match the first word (before first timestamp)
-        first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")
-
-        def parse_timestamp(ts: str) -> float:
-            """Convert timestamp string to seconds."""
-            ts = ts.replace(",", ".")
-            parts = ts.split(":")
-            hours = int(parts[0])
-            minutes = int(parts[1])
-            seconds = float(parts[2])
-            return hours * 3600 + minutes * 60 + seconds
-
-        lines = content.split("\n")
-        i = 0
-        while i < len(lines):
-            line = lines[i].strip()
-
-            # Look for timestamp line
-            ts_match = timestamp_pattern.search(line)
-            if ts_match:
-                cue_start = parse_timestamp(ts_match.group(1))
-                cue_end = parse_timestamp(ts_match.group(2))
-
-                # Read the next non-empty lines for cue content
-                cue_lines = []
-                i += 1
-                while i < len(lines) and lines[i].strip() and not timestamp_pattern.search(lines[i]):
-                    cue_lines.append(lines[i])
-                    i += 1
-
-                # Process cue content
-                for cue_line in cue_lines:
-                    cue_line = cue_line.strip()
-                    if not cue_line:
-                        continue
-
-                    # Check if this line has word-level timestamps
-                    word_matches = word_timestamp_pattern.findall(cue_line)
-                    if word_matches:
-                        # This line has word-level timing
-                        word_alignments = []
-
-                        # Get the first word (before the first timestamp)
-                        first_match = first_word_pattern.match(cue_line)
-                        if first_match:
-                            first_word = first_match.group(1).strip()
-                            first_word_next_ts = parse_timestamp(first_match.group(2))
-                            if first_word:
-                                # First word starts at cue_start
-                                word_alignments.append(
-                                    AlignmentItem(
-                                        symbol=first_word,
-                                        start=cue_start,
-                                        duration=first_word_next_ts - cue_start,
-                                    )
-                                )
-
-                        # Process remaining words with timestamps
-                        for idx, (ts, word) in enumerate(word_matches):
-                            word_start = parse_timestamp(ts)
-                            word = word.strip()
-                            if not word:
-                                continue
-
-                            # Calculate duration based on next word's timestamp or cue end
-                            if idx + 1 < len(word_matches):
-                                next_ts = parse_timestamp(word_matches[idx + 1][0])
-                                duration = next_ts - word_start
-                            else:
-                                duration = cue_end - word_start
-
-                            word_alignments.append(
-                                AlignmentItem(
-                                    symbol=word,
-                                    start=word_start,
-                                    duration=max(0.01, duration),  # Ensure positive duration
-                                )
-                            )
-
-                        if word_alignments:
-                            # Create supervision with word-level alignment
-                            full_text = " ".join(item.symbol for item in word_alignments)
-                            if normalize_text:
-                                full_text = normalize_text_fn(full_text)
-
-                            sup_start = word_alignments[0].start
-                            sup_end = word_alignments[-1].start + word_alignments[-1].duration
-
-                            supervisions.append(
-                                Supervision(
-                                    text=full_text,
-                                    start=sup_start,
-                                    duration=sup_end - sup_start,
-                                    alignment={"word": word_alignments},
-                                )
-                            )
-                    else:
-                        # Plain text line without word-level timing - skip duplicate lines
-                        # (YouTube VTT often repeats the previous line without timestamps)
-                        pass
-
-                continue
-            i += 1
-
-        # Merge consecutive supervisions to form complete utterances
-        if supervisions:
-            supervisions = cls._merge_youtube_vtt_supervisions(supervisions)
-
-        return supervisions
-
-    @classmethod
-    def _merge_youtube_vtt_supervisions(cls, supervisions: List[Supervision]) -> List[Supervision]:
-        """
-        Merge consecutive YouTube VTT supervisions into complete utterances.
-
-        YouTube VTT splits utterances across multiple cues. This method merges
-        cues that are close together in time.
-
-        Args:
-            supervisions: List of supervisions to merge
-
-        Returns:
-            List of merged supervisions
-        """
-        if not supervisions:
-            return supervisions
-
-        merged = []
-        current = supervisions[0]
-
-        for next_sup in supervisions[1:]:
-            # Check if next supervision is close enough to merge (within 0.5 seconds)
-            gap = next_sup.start - (current.start + current.duration)
-
-            if gap < 0.5 and current.alignment and next_sup.alignment:
-                # Merge alignments
-                current_words = current.alignment.get("word", [])
-                next_words = next_sup.alignment.get("word", [])
-                merged_words = list(current_words) + list(next_words)
-
-                # Create merged supervision
-                merged_text = current.text + " " + next_sup.text
-                merged_end = next_sup.start + next_sup.duration
-
-                current = Supervision(
-                    text=merged_text,
-                    start=current.start,
-                    duration=merged_end - current.start,
-                    alignment={"word": merged_words},
-                )
-            else:
-                merged.append(current)
-                current = next_sup
-
-        merged.append(current)
-        return merged
-
-    @classmethod
-    def _is_youtube_vtt_with_word_timestamps(cls, content: str) -> bool:
-        """
-        Check if content is YouTube VTT format with word-level timestamps.
-
-        Args:
-            content: File content to check
-
-        Returns:
-            True if content contains YouTube-style word timestamps
-        """
-        # Look for pattern like <00:00:10.559><c> word</c>
-        return bool(re.search(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>", content))
-
-    @classmethod
-    def _parse_supervisions(
-        cls, caption: Pathlike, format: Optional[str], normalize_text: Optional[bool] = False
-    ) -> List[Supervision]:
-        """
-        Parse supervisions from caption file.
-
-        Args:
-            caption: Caption file path or content
-            format: Caption format
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        if format:
-            format = format.lower()
-
-        # Check for YouTube VTT with word-level timestamps first
-        caption_path = Path(str(caption))
-        if caption_path.exists():
-            with open(caption_path, "r", encoding="utf-8") as f:
-                content = f.read()
-            if cls._is_youtube_vtt_with_word_timestamps(content):
-                return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
-
-        # Match Gemini format: explicit format, or files ending with Gemini.md/Gemini3.md,
-        # or files containing "gemini" in the name with .md extension
-        caption_str = str(caption).lower()
-        is_gemini_format = (
-            format == "gemini"
-            or str(caption).endswith("Gemini.md")
-            or str(caption).endswith("Gemini3.md")
-            or ("gemini" in caption_str and caption_str.endswith(".md"))
-        )
-        if is_gemini_format:
-            from .gemini_reader import GeminiReader
-
-            supervisions = GeminiReader.extract_for_alignment(caption)
-        elif format and (format == "textgrid" or str(caption).lower().endswith("textgrid")):
-            # Internel usage
-            from tgt import read_textgrid
-
-            tgt = read_textgrid(caption)
-            supervisions = []
-            for tier in tgt.tiers:
-                supervisions.extend(
-                    [
-                        Supervision(
-                            text=interval.text,
-                            start=interval.start_time,
-                            duration=interval.end_time - interval.start_time,
-                            speaker=tier.name,
-                        )
-                        for interval in tier.intervals
-                    ]
-                )
-            supervisions = sorted(supervisions, key=lambda x: x.start)
-        elif format == "tsv" or str(caption)[-4:].lower() == ".tsv":
-            supervisions = cls._parse_tsv(caption, normalize_text)
-        elif format == "csv" or str(caption)[-4:].lower() == ".csv":
-            supervisions = cls._parse_csv(caption, normalize_text)
-        elif format == "aud" or str(caption)[-4:].lower() == ".aud":
-            supervisions = cls._parse_aud(caption, normalize_text)
-        elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
-            supervisions = cls._parse_sbv(caption, normalize_text)
-        elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
-            if not Path(str(caption)).exists():  # str
-                lines = [line.strip() for line in str(caption).split("\n")]
-            else:  # file
-                path_str = str(caption)
-                with open(path_str, encoding="utf-8") as f:
-                    lines = [line.strip() for line in f.readlines()]
-            if normalize_text:
-                lines = [normalize_text_fn(line) for line in lines]
-            supervisions = []
-            for line in lines:
-                if line:
-                    # First try to parse timestamp format: [start-end] text
-                    start, end, remaining_text = parse_timestamp_text(line)
-                    if start is not None and end is not None:
-                        # Has timestamp, now check for speaker in the remaining text
-                        speaker, text = parse_speaker_text(remaining_text)
-                        supervisions.append(
-                            Supervision(
-                                text=text,
-                                start=start,
-                                duration=end - start,
-                                speaker=speaker,
-                            )
-                        )
-                    else:
-                        # No timestamp, just parse speaker and text
-                        speaker, text = parse_speaker_text(line)
-                        supervisions.append(Supervision(text=text, speaker=speaker))
-        else:
-            try:
-                supervisions = cls._parse_caption(caption, format=format, normalize_text=normalize_text)
-            except Exception as e:
-                print(f"Failed to parse caption with Format: {format}, Exception: {e}, trying 'gemini' parser.")
-                from .gemini_reader import GeminiReader
-
-                supervisions = GeminiReader.extract_for_alignment(caption)
-
-        return supervisions
-
-    @classmethod
-    def _parse_tsv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse TSV (Tab-Separated Values) format caption file.
-
-        Format specifications:
-        - With speaker: speaker\tstart\tend\ttext
-        - Without speaker: start\tend\ttext
-        - Times are in milliseconds
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-
-        # Check if first line is a header
-        first_line = lines[0].strip().lower()
-        has_header = "start" in first_line and "end" in first_line and "text" in first_line
-        has_speaker_column = "speaker" in first_line
-
-        start_idx = 1 if has_header else 0
-
-        for line in lines[start_idx:]:
-            line = line.strip()
-            if not line:
-                continue
-
-            parts = line.split("\t")
-            if len(parts) < 3:
-                continue
-
-            try:
-                if has_speaker_column and len(parts) >= 4:
-                    # Format: speaker\tstart\tend\ttext
-                    speaker = parts[0].strip() if parts[0].strip() else None
-                    start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[2]) / 1000.0
-                    text = "\t".join(parts[3:]).strip()
-                else:
-                    # Format: start\tend\ttext
-                    start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[1]) / 1000.0
-                    text = "\t".join(parts[2:]).strip()
-                    speaker = None
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed lines
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _parse_csv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse CSV (Comma-Separated Values) format caption file.
-
-        Format specifications:
-        - With speaker: speaker,start,end,text
-        - Without speaker: start,end,text
-        - Times are in milliseconds
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        import csv
-
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8", newline="") as f:
-            reader = csv.reader(f)
-            lines = list(reader)
-
-        if not lines:
-            return supervisions
-
-        # Check if first line is a header
-        first_line = [col.strip().lower() for col in lines[0]]
-        has_header = "start" in first_line and "end" in first_line and "text" in first_line
-        has_speaker_column = "speaker" in first_line
-
-        start_idx = 1 if has_header else 0
-
-        for parts in lines[start_idx:]:
-            if len(parts) < 3:
-                continue
-
-            try:
-                if has_speaker_column and len(parts) >= 4:
-                    # Format: speaker,start,end,text
-                    speaker = parts[0].strip() if parts[0].strip() else None
-                    start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[2]) / 1000.0
-                    text = ",".join(parts[3:]).strip()
-                else:
-                    # Format: start,end,text
-                    start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                    end = float(parts[1]) / 1000.0
-                    text = ",".join(parts[2:]).strip()
-                    speaker = None
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed lines
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _parse_aud(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse AUD (Audacity Labels) format caption file.
-
-        Format: start\tend\t[[speaker]]text
-        - Times are in seconds (float)
-        - Speaker is optional and enclosed in [[brackets]]
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-
-            parts = line.split("\t")
-            if len(parts) < 3:
-                continue
-
-            try:
-                # AUD format: start\tend\ttext (speaker in [[brackets]])
-                start = float(parts[0])
-                end = float(parts[1])
-                text = "\t".join(parts[2:]).strip()
-
-                # Extract speaker from [[speaker]] prefix
-                speaker = None
-                speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
-                if speaker_match:
-                    speaker = speaker_match.group(1)
-                    text = speaker_match.group(2)
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed lines
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-        """
-        Parse SubViewer (SBV) format caption file.
-
-        Format:
-            0:00:00.000,0:00:02.000
-            Text line 1
-
-            0:00:02.000,0:00:04.000
-            Text line 2
-
-        Args:
-            caption: Caption file path
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        caption_path = Path(str(caption))
-        if not caption_path.exists():
-            raise FileNotFoundError(f"Caption file not found: {caption}")
-
-        supervisions = []
-
-        with open(caption_path, "r", encoding="utf-8") as f:
-            content = f.read()
-
-        # Split by double newlines to separate entries
-        entries = content.strip().split("\n\n")
-
-        for entry in entries:
-            lines = entry.strip().split("\n")
-            if len(lines) < 2:
-                continue
-
-            # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
-            timestamp_line = lines[0].strip()
-            # Remaining lines: text
-            text_lines = lines[1:]
-
-            try:
-                # Parse timestamp: 0:00:00.000,0:00:02.000
-                if "," not in timestamp_line:
-                    continue
-
-                start_str, end_str = timestamp_line.split(",", 1)
-
-                # Parse start time
-                start_parts = start_str.strip().split(":")
-                if len(start_parts) == 3:
-                    h, m, s = start_parts
-                    s_parts = s.split(".")
-                    start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                    if len(s_parts) > 1:
-                        start += int(s_parts[1]) / 1000.0
-                else:
-                    continue
-
-                # Parse end time
-                end_parts = end_str.strip().split(":")
-                if len(end_parts) == 3:
-                    h, m, s = end_parts
-                    s_parts = s.split(".")
-                    end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                    if len(s_parts) > 1:
-                        end += int(s_parts[1]) / 1000.0
-                else:
-                    continue
-
-                # Parse text and speaker
-                text = " ".join(text_lines).strip()
-                speaker, text = parse_speaker_text(text)
-
-                if normalize_text:
-                    text = normalize_text_fn(text)
-
-                duration = end - start
-                if duration < 0:
-                    continue
-
-                supervisions.append(
-                    Supervision(
-                        text=text,
-                        start=start,
-                        duration=duration,
-                        speaker=speaker,
-                    )
-                )
-            except (ValueError, IndexError):
-                # Skip malformed entries
-                continue
-
-        return supervisions
-
-    @classmethod
-    def _write_tsv(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to TSV format.
-
-        Format: speaker\tstart\tend\ttext (with speaker)
-            or: start\tend\ttext (without speaker)
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output TSV file
-            include_speaker_in_text: Whether to include speaker column
-        """
-        with open(output_path, "w", encoding="utf-8") as file:
-            # Write header
-            if include_speaker_in_text:
-                file.write("speaker\tstart\tend\ttext\n")
-                for supervision in alignments:
-                    # Respect `original_speaker` custom flag: default to True when missing
-                    include_speaker = supervision.speaker and (
-                        not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                    )
-                    speaker = supervision.speaker if include_speaker else ""
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip().replace("\t", " ")
-                    file.write(f"{speaker}\t{start_ms}\t{end_ms}\t{text}\n")
-            else:
-                file.write("start\tend\ttext\n")
-                for supervision in alignments:
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip().replace("\t", " ")
-                    file.write(f"{start_ms}\t{end_ms}\t{text}\n")
-
-    @classmethod
-    def _write_csv(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to CSV format.
-
-        Format: speaker,start,end,text (with speaker)
-            or: start,end,text (without speaker)
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output CSV file
-            include_speaker_in_text: Whether to include speaker column
-        """
-        import csv
-
-        with open(output_path, "w", encoding="utf-8", newline="") as file:
-            if include_speaker_in_text:
-                writer = csv.writer(file)
-                writer.writerow(["speaker", "start", "end", "text"])
-                for supervision in alignments:
-                    include_speaker = supervision.speaker and (
-                        not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                    )
-                    speaker = supervision.speaker if include_speaker else ""
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip()
-                    writer.writerow([speaker, start_ms, end_ms, text])
-            else:
-                writer = csv.writer(file)
-                writer.writerow(["start", "end", "text"])
-                for supervision in alignments:
-                    start_ms = round(1000 * supervision.start)
-                    end_ms = round(1000 * supervision.end)
-                    text = supervision.text.strip()
-                    writer.writerow([start_ms, end_ms, text])
-
-    @classmethod
-    def _write_aud(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to AUD format.
-
-        Format: start\tend\t[[speaker]]text
-            or: start\tend\ttext (without speaker)
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output AUD file
-            include_speaker_in_text: Whether to include speaker in [[brackets]]
-        """
-        with open(output_path, "w", encoding="utf-8") as file:
-            for supervision in alignments:
-                start = supervision.start
-                end = supervision.end
-                text = supervision.text.strip().replace("\t", " ")
-
-                # Respect `original_speaker` custom flag when adding speaker prefix
-                if (
-                    include_speaker_in_text
-                    and supervision.speaker
-                    and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                ):
-                    text = f"[[{supervision.speaker}]]{text}"
-
-                file.write(f"{start}\t{end}\t{text}\n")
-
-    @classmethod
-    def _write_sbv(
-        cls,
-        alignments: List[Supervision],
-        output_path: Pathlike,
-        include_speaker_in_text: bool = True,
-    ) -> None:
-        """
-        Write caption to SubViewer (SBV) format.
-
-        Format:
-            0:00:00.000,0:00:02.000
-            Text line 1
-
-            0:00:02.000,0:00:04.000
-            Text line 2
-
-        Args:
-            alignments: List of supervision segments to write
-            output_path: Path to output SBV file
-            include_speaker_in_text: Whether to include speaker in text
-        """
-        with open(output_path, "w", encoding="utf-8") as file:
-            for i, supervision in enumerate(alignments):
-                # Format timestamps as H:MM:SS.mmm
-                start_h = int(supervision.start // 3600)
-                start_m = int((supervision.start % 3600) // 60)
-                start_s = int(supervision.start % 60)
-                start_ms = int((supervision.start % 1) * 1000)
-
-                end_h = int(supervision.end // 3600)
-                end_m = int((supervision.end % 3600) // 60)
-                end_s = int(supervision.end % 60)
-                end_ms = int((supervision.end % 1) * 1000)
-
-                start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
-                end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
-
-                # Write timestamp line
-                file.write(f"{start_time},{end_time}\n")
-
-                # Write text (with optional speaker). Respect `original_speaker` custom flag.
-                text = supervision.text.strip()
-                if (
-                    include_speaker_in_text
-                    and supervision.speaker
-                    and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                ):
-                    text = f"{supervision.speaker}: {text}"
-
-                file.write(f"{text}\n")
-
-                # Add blank line between entries (except after last one)
-                if i < len(alignments) - 1:
-                    file.write("\n")
-
-    @classmethod
-    def _parse_caption(
-        cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
-    ) -> List[Supervision]:
-        """
-        Parse caption using pysubs2.
-
-        Args:
-            caption: Caption file path or content
-            format: Caption format
-            normalize_text: Whether to normalize text
-
-        Returns:
-            List of Supervision objects
-        """
-        import pysubs2
-
-        try:
-            subs: pysubs2.SSAFile = pysubs2.load(
-                caption, encoding="utf-8", format_=format if format != "auto" else None
-            )  # file
-        except IOError:
-            try:
-                subs: pysubs2.SSAFile = pysubs2.SSAFile.from_string(
-                    caption, format_=format if format != "auto" else None
-                )  # str
-            except Exception as e:
-                del e
-                subs: pysubs2.SSAFile = pysubs2.load(caption, encoding="utf-8")  # auto detect format
-
-        # Parse supervisions
-        supervisions = []
-        for event in subs.events:
-            if normalize_text:
-                event.text = normalize_text_fn(event.text)
-            speaker, text = parse_speaker_text(event.text)
-            supervisions.append(
-                Supervision(
-                    text=text,
-                    speaker=speaker or event.name,
-                    start=event.start / 1000.0 if event.start is not None else None,
-                    duration=(event.end - event.start) / 1000.0 if event.end is not None else None,
-                )
-            )
-        return supervisions
-
     def __repr__(self) -> str:
         """String representation of Caption."""
         lang = f"lang={self.language}" if self.language else "lang=unknown"
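
A behavioral detail of the reworked Caption.shift in the first hunk above: shifted segments are now clipped at t = 0 instead of carrying negative timestamps. A worked example of the clipping rule, mirroring the raw_start/raw_end arithmetic in the diff (the values are illustrative):

    # Segment [3.0, 6.0) shifted by -5.0 seconds:
    raw_start = 3.0 + (-5.0)  # -2.0
    raw_end = 6.0 + (-5.0)    # 1.0: still positive, so the segment is kept
    final_start = 0.0         # clipped, because raw_start < 0
    final_duration = raw_end  # 1.0

    # A segment [1.0, 4.0) shifted by -5.0 has raw_end = -1.0 <= 0 and is dropped
    # entirely; word-level AlignmentItems inside surviving segments are clipped
    # the same way.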