lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
  32. lattifai-1.3.1.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,661 +0,0 @@
1
- """Caption data structure for storing subtitle information with metadata."""
2
-
3
- from __future__ import annotations
4
-
5
- import io
6
- from dataclasses import dataclass, field
7
- from pathlib import Path
8
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
9
-
10
- if TYPE_CHECKING:
11
- from ..config.caption import KaraokeConfig
12
-
13
- from lhotse.supervision import AlignmentItem
14
- from lhotse.utils import Pathlike
15
- from tgt import TextGrid
16
-
17
- from ..config.caption import InputCaptionFormat, OutputCaptionFormat # noqa: F401
18
- from .formats import detect_format, get_reader, get_writer
19
- from .supervision import Supervision
20
-
21
- DiarizationOutput = TypeVar("DiarizationOutput")
22
-
23
-
24
- @dataclass
25
- class Caption:
26
- """
27
- Container for caption/subtitle data with metadata.
28
-
29
- This class encapsulates a list of supervisions (subtitle segments) along with
30
- metadata such as language, kind, format information, and source file details.
31
-
32
- Attributes:
33
- supervisions: List of supervision segments containing text and timing information
34
- language: Language code (e.g., 'en', 'zh', 'es')
35
- kind: Caption kind/type (e.g., 'captions', 'subtitles', 'descriptions')
36
- source_format: Original format of the caption file (e.g., 'vtt', 'srt', 'json')
37
- source_path: Path to the source caption file
38
- metadata: Additional custom metadata as key-value pairs
39
- """
40
-
41
- # read from subtitle file
42
- supervisions: List[Supervision] = field(default_factory=list)
43
- # Transcription results
44
- transcription: List[Supervision] = field(default_factory=list)
45
- # Audio Event Detection results
46
- audio_events: Optional[TextGrid] = None
47
- # Speaker Diarization results
48
- speaker_diarization: Optional[DiarizationOutput] = None
49
- # Alignment results
50
- alignments: List[Supervision] = field(default_factory=list)
51
-
52
- language: Optional[str] = None
53
- kind: Optional[str] = None
54
- source_format: Optional[str] = None
55
- source_path: Optional[Pathlike] = None
56
- metadata: Dict[str, Any] = field(default_factory=dict)
57
-
58
- def __len__(self) -> int:
59
- """Return the number of supervision segments."""
60
- return len(self.supervisions or self.transcription)
61
-
62
- def __iter__(self):
63
- """Iterate over supervision segments."""
64
- return iter(self.supervisions)
65
-
66
- def __getitem__(self, index):
67
- """Get supervision segment by index."""
68
- return self.supervisions[index]
69
-
70
- def __bool__(self) -> bool:
71
- """Return True if caption has supervisions."""
72
- return len(self) > 0
73
-
74
- @property
75
- def is_empty(self) -> bool:
76
- """Check if caption has no supervisions."""
77
- return len(self.supervisions) == 0
78
-
79
- @property
80
- def duration(self) -> Optional[float]:
81
- """
82
- Get total duration of the caption in seconds.
83
-
84
- Returns:
85
- Total duration from first to last supervision, or None if empty
86
- """
87
- if not self.supervisions:
88
- return None
89
- return self.supervisions[-1].end - self.supervisions[0].start
90
-
91
- @property
92
- def start_time(self) -> Optional[float]:
93
- """Get start time of first supervision."""
94
- if not self.supervisions:
95
- return None
96
- return self.supervisions[0].start
97
-
98
- @property
99
- def end_time(self) -> Optional[float]:
100
- """Get end time of last supervision."""
101
- if not self.supervisions:
102
- return None
103
- return self.supervisions[-1].end
104
-
105
- def append(self, supervision: Supervision) -> None:
106
- """Add a supervision segment to the caption."""
107
- self.supervisions.append(supervision)
108
-
109
- def extend(self, supervisions: List[Supervision]) -> None:
110
- """Add multiple supervision segments to the caption."""
111
- self.supervisions.extend(supervisions)
112
-
113
- def filter_by_speaker(self, speaker: str) -> "Caption":
114
- """
115
- Create a new Caption with only supervisions from a specific speaker.
116
-
117
- Args:
118
- speaker: Speaker identifier to filter by
119
-
120
- Returns:
121
- New Caption instance with filtered supervisions
122
- """
123
- filtered_sups = [sup for sup in self.supervisions if sup.speaker == speaker]
124
- return Caption(
125
- supervisions=filtered_sups,
126
- language=self.language,
127
- kind=self.kind,
128
- source_format=self.source_format,
129
- source_path=self.source_path,
130
- metadata=self.metadata.copy(),
131
- )
132
-
133
- def get_speakers(self) -> List[str]:
134
- """
135
- Get list of unique speakers in the caption.
136
-
137
- Returns:
138
- Sorted list of unique speaker identifiers
139
- """
140
- speakers = {sup.speaker for sup in self.supervisions if sup.speaker}
141
- return sorted(speakers)
142
-
143
- def shift_time(self, seconds: float) -> "Caption":
144
- """
145
- Create a new Caption with all timestamps shifted by given seconds.
146
-
147
- Args:
148
- seconds: Number of seconds to shift (positive delays, negative advances)
149
-
150
- Returns:
151
- New Caption instance with shifted timestamps
152
- """
153
- shifted_sups = []
154
- for sup in self.supervisions:
155
- # Calculate physical time range
156
- raw_start = sup.start + seconds
157
- raw_end = sup.end + seconds
158
-
159
- # Skip segments that end before 0
160
- if raw_end <= 0:
161
- continue
162
-
163
- # Clip start to 0 if negative
164
- if raw_start < 0:
165
- final_start = 0.0
166
- final_duration = raw_end
167
- else:
168
- final_start = raw_start
169
- final_duration = sup.duration
170
-
171
- # Handle alignment (word-level timestamps)
172
- final_alignment = None
173
- original_alignment = getattr(sup, "alignment", None)
174
- if original_alignment and "word" in original_alignment:
175
- new_words = []
176
- for word in original_alignment["word"]:
177
- w_start = word.start + seconds
178
- w_end = w_start + word.duration
179
-
180
- # Skip words that end before 0
181
- if w_end <= 0:
182
- continue
183
-
184
- # Clip start to 0 if negative
185
- if w_start < 0:
186
- w_final_start = 0.0
187
- w_final_duration = w_end
188
- else:
189
- w_final_start = w_start
190
- w_final_duration = word.duration
191
-
192
- new_words.append(
193
- AlignmentItem(
194
- symbol=word.symbol,
195
- start=w_final_start,
196
- duration=w_final_duration,
197
- score=word.score,
198
- )
199
- )
200
-
201
- # Copy original alignment dict structure and update words
202
- final_alignment = original_alignment.copy()
203
- final_alignment["word"] = new_words
204
-
205
- shifted_sups.append(
206
- Supervision(
207
- text=sup.text,
208
- start=final_start,
209
- duration=final_duration,
210
- speaker=sup.speaker,
211
- id=sup.id,
212
- recording_id=sup.recording_id if hasattr(sup, "recording_id") else "",
213
- channel=getattr(sup, "channel", 0),
214
- language=sup.language,
215
- alignment=final_alignment,
216
- custom=sup.custom,
217
- )
218
- )
219
-
220
- return Caption(
221
- supervisions=shifted_sups,
222
- language=self.language,
223
- kind=self.kind,
224
- source_format=self.source_format,
225
- source_path=self.source_path,
226
- metadata=self.metadata.copy(),
227
- )
228
-
229
- def with_margins(
230
- self,
231
- start_margin: float = 0.08,
232
- end_margin: float = 0.20,
233
- min_gap: float = 0.08,
234
- collision_mode: str = "trim",
235
- ) -> "Caption":
236
- """
237
- Create a new Caption with segment boundaries adjusted based on word-level alignment.
238
-
239
- Uses supervision.alignment['word'] to recalculate segment start/end times
240
- with the specified margins applied around the actual speech boundaries.
241
-
242
- Args:
243
- start_margin: Seconds to extend before the first word (default: 0.08)
244
- end_margin: Seconds to extend after the last word (default: 0.20)
245
- min_gap: Minimum gap between segments for collision handling (default: 0.08)
246
- collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')
247
-
248
- Returns:
249
- New Caption instance with adjusted timestamps
250
-
251
- Note:
252
- Segments without alignment data will keep their original timestamps.
253
-
254
- Example:
255
- >>> caption = Caption.read("aligned.srt")
256
- >>> adjusted = caption.with_margins(start_margin=0.05, end_margin=0.15)
257
- >>> adjusted.write("output.srt")
258
- """
259
- from .standardize import apply_margins_to_captions
260
-
261
- # Determine which supervisions to use
262
- if self.alignments:
263
- source_sups = self.alignments
264
- elif self.supervisions:
265
- source_sups = self.supervisions
266
- else:
267
- source_sups = self.transcription
268
-
269
- adjusted_sups = apply_margins_to_captions(
270
- source_sups,
271
- start_margin=start_margin,
272
- end_margin=end_margin,
273
- min_gap=min_gap,
274
- collision_mode=collision_mode,
275
- )
276
-
277
- return Caption(
278
- supervisions=adjusted_sups,
279
- transcription=self.transcription,
280
- audio_events=self.audio_events,
281
- speaker_diarization=self.speaker_diarization,
282
- alignments=[], # Clear alignments since we've applied them
283
- language=self.language,
284
- kind=self.kind,
285
- source_format=self.source_format,
286
- source_path=self.source_path,
287
- metadata=self.metadata.copy(),
288
- )
289
-
290
- def to_string(
291
- self,
292
- format: str = "srt",
293
- word_level: bool = False,
294
- karaoke_config: Optional["KaraokeConfig"] = None,
295
- metadata: Optional[Dict[str, Any]] = None,
296
- ) -> str:
297
- """
298
- Return caption content in specified format.
299
-
300
- Args:
301
- format: Output format (e.g., 'srt', 'vtt', 'ass')
302
- word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
303
- karaoke_config: Karaoke configuration. When provided with enabled=True,
304
- enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
305
- metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
306
-
307
- Returns:
308
- String containing formatted captions
309
- """
310
- return self.to_bytes(
311
- output_format=format, word_level=word_level, karaoke_config=karaoke_config, metadata=metadata
312
- ).decode("utf-8")
313
-
314
- def to_dict(self) -> Dict:
315
- """
316
- Convert Caption to dictionary representation.
317
-
318
- Returns:
319
- Dictionary with caption data and metadata
320
- """
321
- return {
322
- "supervisions": [sup.to_dict() for sup in self.supervisions],
323
- "language": self.language,
324
- "kind": self.kind,
325
- "source_format": self.source_format,
326
- "source_path": str(self.source_path) if self.source_path else None,
327
- "metadata": self.metadata,
328
- "duration": self.duration,
329
- "num_segments": len(self.supervisions),
330
- "speakers": self.get_speakers(),
331
- }
332
-
333
- @classmethod
334
- def from_supervisions(
335
- cls,
336
- supervisions: List[Supervision],
337
- language: Optional[str] = None,
338
- kind: Optional[str] = None,
339
- source_format: Optional[str] = None,
340
- source_path: Optional[Pathlike] = None,
341
- metadata: Optional[Dict[str, str]] = None,
342
- ) -> "Caption":
343
- """
344
- Create Caption from a list of supervisions.
345
-
346
- Args:
347
- supervisions: List of supervision segments
348
- language: Language code
349
- kind: Caption kind/type
350
- source_format: Original format
351
- source_path: Source file path
352
- metadata: Additional metadata
353
-
354
- Returns:
355
- New Caption instance
356
- """
357
- return cls(
358
- supervisions=supervisions,
359
- language=language,
360
- kind=kind,
361
- source_format=source_format,
362
- source_path=source_path,
363
- metadata=metadata or {},
364
- )
365
-
366
- @classmethod
367
- def from_string(
368
- cls,
369
- content: str,
370
- format: str,
371
- normalize_text: bool = True,
372
- ) -> "Caption":
373
- """
374
- Create Caption from string content.
375
-
376
- Args:
377
- content: Caption content as string
378
- format: Caption format (e.g., 'srt', 'vtt', 'ass')
379
- normalize_text: Whether to normalize text during reading
380
-
381
- Returns:
382
- New Caption instance
383
-
384
- Example:
385
- >>> srt_content = \"\"\"1
386
- ... 00:00:00,000 --> 00:00:02,000
387
- ... Hello world\"\"\"
388
- >>> caption = Caption.from_string(srt_content, format=\"srt\")
389
- """
390
- buffer = io.StringIO(content)
391
- return cls.read(buffer, format=format, normalize_text=normalize_text)
392
-
393
- def to_bytes(
394
- self,
395
- output_format: Optional[str] = None,
396
- include_speaker_in_text: bool = True,
397
- word_level: bool = False,
398
- karaoke_config: Optional["KaraokeConfig"] = None,
399
- metadata: Optional[Dict[str, Any]] = None,
400
- ) -> bytes:
401
- """
402
- Convert caption to bytes.
403
-
404
- Args:
405
- output_format: Output format (e.g., 'srt', 'vtt', 'ass'). Defaults to source_format or 'srt'
406
- include_speaker_in_text: Whether to include speaker labels in text
407
- word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
408
- karaoke_config: Karaoke configuration. When provided with enabled=True,
409
- enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
410
- metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
411
-
412
- Returns:
413
- Caption content as bytes
414
-
415
- Example:
416
- >>> caption = Caption.read("input.srt")
417
- >>> # Get as bytes in original format
418
- >>> data = caption.to_bytes()
419
- >>> # Get as bytes in specific format
420
- >>> vtt_data = caption.to_bytes(output_format="vtt")
421
- """
422
- return self.write(
423
- None,
424
- output_format=output_format,
425
- include_speaker_in_text=include_speaker_in_text,
426
- word_level=word_level,
427
- karaoke_config=karaoke_config,
428
- metadata=metadata,
429
- )
430
-
431
- @classmethod
432
- def from_transcription_results(
433
- cls,
434
- transcription: List[Supervision],
435
- audio_events: Optional[TextGrid] = None,
436
- speaker_diarization: Optional[DiarizationOutput] = None,
437
- language: Optional[str] = None,
438
- source_path: Optional[Pathlike] = None,
439
- metadata: Optional[Dict[str, str]] = None,
440
- ) -> "Caption":
441
- """
442
- Create Caption from transcription results including audio events and diarization.
443
-
444
- Args:
445
- transcription: List of transcription supervision segments
446
- audio_events: Optional TextGrid with audio event detection results
447
- speaker_diarization: Optional DiarizationOutput with speaker diarization results
448
- language: Language code
449
- source_path: Source file path
450
- metadata: Additional metadata
451
-
452
- Returns:
453
- New Caption instance with transcription data
454
- """
455
- return cls(
456
- transcription=transcription,
457
- audio_events=audio_events,
458
- speaker_diarization=speaker_diarization,
459
- language=language,
460
- kind="transcription",
461
- source_format="asr",
462
- source_path=source_path,
463
- metadata=metadata or {},
464
- )
465
-
466
- @classmethod
467
- def read(
468
- cls,
469
- path: Union[Pathlike, io.BytesIO, io.StringIO],
470
- format: Optional[str] = None,
471
- normalize_text: bool = True,
472
- ) -> "Caption":
473
- """
474
- Read caption file or in-memory data and return Caption object.
475
-
476
- Args:
477
- path: Path to caption file, or BytesIO/StringIO object with caption content
478
- format: Caption format (auto-detected if not provided, required for in-memory data)
479
- normalize_text: Whether to normalize text during reading
480
-
481
- Returns:
482
- Caption object containing supervisions and metadata
483
- """
484
- # Detect format if not provided
485
- if not format:
486
- if isinstance(path, (io.BytesIO, io.StringIO)):
487
- raise ValueError("format parameter is required when reading from BytesIO/StringIO")
488
- format = detect_format(str(path))
489
-
490
- if not format:
491
- # Fallback to extension
492
- if not isinstance(path, (io.BytesIO, io.StringIO)):
493
- format = Path(str(path)).suffix.lstrip(".").lower()
494
-
495
- if not format:
496
- format = "srt" # Last resort default
497
-
498
- # Get content if it's an in-memory buffer
499
- source = path
500
- if isinstance(path, io.BytesIO):
501
- source = path.read().decode("utf-8")
502
- elif isinstance(path, io.StringIO):
503
- source = path.read()
504
-
505
- # Reset buffer position if it was a stream
506
- if isinstance(path, (io.BytesIO, io.StringIO)):
507
- path.seek(0)
508
-
509
- # Get reader and perform extraction
510
- reader_cls = get_reader(format)
511
- if not reader_cls:
512
- # Use pysubs2 as a generic fallback if no specific reader exists
513
- from .formats.pysubs2 import Pysubs2Format
514
-
515
- reader_cls = Pysubs2Format
516
-
517
- supervisions = reader_cls.read(source, normalize_text=normalize_text)
518
- metadata = reader_cls.extract_metadata(source)
519
-
520
- # Create Caption object
521
- source_path = None
522
- if isinstance(path, (str, Path)) and not ("\n" in str(path) or len(str(path)) > 500):
523
- try:
524
- p = Path(str(path))
525
- if p.exists():
526
- source_path = str(p)
527
- except (OSError, ValueError):
528
- pass
529
-
530
- return cls(
531
- supervisions=supervisions,
532
- language=metadata.get("language"),
533
- kind=metadata.get("kind"),
534
- source_format=format,
535
- source_path=source_path,
536
- metadata=metadata,
537
- )
538
-
539
- def write(
540
- self,
541
- path: Union[Pathlike, io.BytesIO, None] = None,
542
- output_format: Optional[str] = None,
543
- include_speaker_in_text: bool = True,
544
- word_level: bool = False,
545
- karaoke_config: Optional["KaraokeConfig"] = None,
546
- metadata: Optional[Dict[str, Any]] = None,
547
- ) -> Union[Pathlike, bytes]:
548
- """
549
- Write caption to file or return as bytes.
550
-
551
- Args:
552
- path: Path to output caption file, BytesIO object, or None to return bytes
553
- output_format: Output format (e.g., 'srt', 'vtt', 'ass')
554
- include_speaker_in_text: Whether to include speaker labels in text
555
- word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
556
- karaoke_config: Karaoke configuration. When provided with enabled=True,
557
- enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
558
- metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
559
- Can be used to override or supplement format-specific metadata.
560
-
561
- Returns:
562
- Path to the written file if path is a file path, or bytes if path is BytesIO/None
563
- """
564
- if self.alignments:
565
- supervisions = self.alignments
566
- elif self.supervisions:
567
- supervisions = self.supervisions
568
- else:
569
- supervisions = self.transcription
570
-
571
- # Merge external metadata with self.metadata (external takes precedence)
572
- effective_metadata = dict(self.metadata) if self.metadata else {}
573
- if metadata:
574
- effective_metadata.update(metadata)
575
-
576
- # Determine output format
577
- if output_format:
578
- output_format = output_format.lower()
579
- elif isinstance(path, (io.BytesIO, type(None))):
580
- output_format = self.source_format or "srt"
581
- else:
582
- output_format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"
583
-
584
- # Special casing for professional formats as before
585
- ext = output_format
586
- if isinstance(path, (str, Path)):
587
- path_str = str(path)
588
- if path_str.endswith("_avid.txt"):
589
- ext = "avid_ds"
590
- elif "audition" in path_str.lower() and path_str.endswith(".csv"):
591
- ext = "audition_csv"
592
- elif "edimarker" in path_str.lower() and path_str.endswith(".csv"):
593
- ext = "edimarker_csv"
594
- elif "imsc" in path_str.lower() and path_str.endswith(".ttml"):
595
- ext = "imsc1"
596
- elif "ebu" in path_str.lower() and path_str.endswith(".ttml"):
597
- ext = "ebu_tt_d"
598
-
599
- writer_cls = get_writer(ext)
600
- if not writer_cls:
601
- from .formats.pysubs2 import Pysubs2Format
602
-
603
- writer_cls = Pysubs2Format
604
-
605
- if isinstance(path, (str, Path)):
606
- return writer_cls.write(
607
- supervisions,
608
- path,
609
- include_speaker=include_speaker_in_text,
610
- word_level=word_level,
611
- karaoke_config=karaoke_config,
612
- metadata=effective_metadata,
613
- )
614
-
615
- content = writer_cls.to_bytes(
616
- supervisions,
617
- include_speaker=include_speaker_in_text,
618
- word_level=word_level,
619
- karaoke_config=karaoke_config,
620
- metadata=effective_metadata,
621
- )
622
- if isinstance(path, io.BytesIO):
623
- path.write(content)
624
- path.seek(0)
625
- return content
626
-
627
- def read_speaker_diarization(
628
- self,
629
- path: Pathlike,
630
- ) -> "DiarizationOutput":
631
- """
632
- Read speaker diarization TextGrid from file.
633
- """
634
- from lattifai_core.diarization import DiarizationOutput
635
-
636
- self.speaker_diarization = DiarizationOutput.read(path)
637
- return self.speaker_diarization
638
-
639
- def write_speaker_diarization(
640
- self,
641
- path: Pathlike,
642
- ) -> Pathlike:
643
- """
644
- Write speaker diarization TextGrid to file.
645
- """
646
- if not self.speaker_diarization:
647
- raise ValueError("No speaker diarization data to write.")
648
-
649
- self.speaker_diarization.write(path)
650
- return path
651
-
652
- def __repr__(self) -> str:
653
- """String representation of Caption."""
654
- lang = f"lang={self.language}" if self.language else "lang=unknown"
655
- kind_str = f"kind={self.kind}" if self.kind else ""
656
- parts = [f"Caption({len(self.supervisions or self.transcription)} segments", lang]
657
- if kind_str:
658
- parts.append(kind_str)
659
- if self.duration:
660
- parts.append(f"duration={self.duration:.2f}s")
661
- return ", ".join(parts) + ")"