lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai/client.py CHANGED
@@ -7,7 +7,7 @@ import colorful
7
7
  from lattifai_core.client import SyncAPIClient
8
8
  from lhotse.utils import Pathlike
9
9
 
10
- from lattifai.alignment import Lattice1Aligner, Segmenter
10
+ from lattifai.alignment import Lattice1Aligner, Segmenter, align_supervisions_and_transcription
11
11
  from lattifai.audio2 import AudioData, AudioLoader
12
12
  from lattifai.caption import Caption, InputCaptionFormat
13
13
  from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
@@ -123,86 +123,46 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
123
123
  alignment_strategy = self.aligner.config.strategy
124
124
 
125
125
  if alignment_strategy != "entire" or caption.transcription:
126
- safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
126
+ safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
127
127
 
128
128
  if caption.supervisions and alignment_strategy == "transcription":
129
- # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
130
- assert (
131
- "gemini" not in self.transcriber.name.lower()
132
- ), "Transcription-based alignment is not supported with Gemini transcriber."
133
- assert (
134
- caption.supervisions
135
- ), "Input caption should contain supervisions when using transcription-based alignment."
129
+ if "gemini" in self.transcriber.name.lower():
130
+ raise ValueError(
131
+ f"Transcription-based alignment is not supported for {self.transcriber.name} "
132
+ "(Gemini's timestamp is not reliable)."
133
+ )
136
134
  if not caption.transcription:
137
- import asyncio
138
-
139
- safe_print(colorful.cyan("📝 Transcribing media for alignment..."))
140
- if output_caption_path:
141
- transcript_file = (
142
- Path(str(output_caption_path)).parent
143
- / f"{Path(str(media_audio)).stem}_{self.transcriber.file_name}"
144
- )
145
- if transcript_file.exists():
146
- # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
147
- transcript = self._read_caption(transcript_file, verbose=False)
148
- caption.transcription = transcript.supervisions
149
- caption.audio_events = transcript.audio_events
150
-
151
- if not caption.transcription:
152
- transcript = asyncio.run(
153
- self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
154
- )
155
- caption.transcription = transcript.transcription
156
- caption.audio_events = transcript.audio_events
157
-
158
- # Align caption.supervisions with transcription to get segments
159
- import regex
160
- from error_align import ErrorAlign, error_align # noqa: F401
161
- from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
162
-
163
- JOIN_TOKEN = "❄"
164
- if JOIN_TOKEN not in DELIMITERS:
165
- DELIMITERS.add(JOIN_TOKEN)
166
-
167
- def custom_tokenizer(text: str) -> list:
168
- """Default tokenizer that splits text into words based on whitespace.
169
-
170
- Args:
171
- text (str): The input text to tokenize.
172
-
173
- Returns:
174
- list: A list of tokens (words).
175
-
176
- """
177
- # Escape JOIN_TOKEN for use in regex pattern
178
- escaped_join_token = regex.escape(JOIN_TOKEN)
179
- return list(
180
- regex.finditer(
181
- rf"({NUMERIC_TOKEN})|({STANDARD_TOKEN}|{escaped_join_token})",
182
- text,
183
- regex.UNICODE | regex.VERBOSE,
184
- )
135
+ transcript = self._transcribe(
136
+ media_audio,
137
+ source_lang=self.caption_config.source_lang,
138
+ is_async=False,
139
+ output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
185
140
  )
141
+ caption.transcription = transcript.supervisions or transcript.transcription
142
+ caption.audio_events = transcript.audio_events
143
+ if not caption.transcription:
144
+ raise ValueError("Transcription is empty after transcription step.")
186
145
 
187
- alignments = error_align(
188
- f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions),
189
- f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
190
- tokenizer=custom_tokenizer,
191
- )
192
-
193
- for align in alignments:
194
- if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
195
- pass
146
+ if split_sentence or self.caption_config.split_sentence:
147
+ caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)
196
148
 
197
- # if align.op_type == OpType.MATCH:
198
- # continue
199
- # elif align.op_type in (OpType.INSERT, OpType.DELETE, OpType.SUBSTITUTE):
200
- # # print(colorful.yellow(f"⚠️ Alignment warning: {op}"))
201
- # pass
149
+ matches = align_supervisions_and_transcription(
150
+ caption, max_duration=media_audio.duration, verbose=True
151
+ )
202
152
 
203
- raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
153
+ skipalign = False
154
+ matches = sorted(matches, key=lambda x: x[2].WER.WER) # sort by WER
155
+ segments = [(m[3].start[1], m[3].end[1], m, skipalign) for m in matches]
156
+ for segment in segments:
157
+ # transcription segments -> sentence splitting
158
+ segment[2][1] = self.aligner.tokenizer.split_sentences(segment[2][1])
204
159
  else:
205
160
  if caption.transcription:
161
+ if "gemini" in self.transcriber.name.lower():
162
+ raise ValueError(
163
+ f"Transcription-based alignment is not supported for {self.transcriber.name} "
164
+ "(Gemini's timestamp is not reliable)."
165
+ )
206
166
  if not caption.supervisions: # youtube + transcription case
207
167
  segments = [(sup.start, sup.end, [sup], not sup.text) for sup in caption.transcription]
208
168
  else:
@@ -220,9 +180,10 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
220
180
  )
221
181
 
222
182
  # align each segment
183
+ sr = media_audio.sampling_rate
223
184
  supervisions, alignments = [], []
224
185
  for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
225
- print(
186
+ safe_print(
226
187
  colorful.green(
227
188
  f" ⏩ aligning segment {i:04d}/{len(segments):04d}: {start:8.2f}s - {end:8.2f}s"
228
189
  )
@@ -234,18 +195,15 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
234
195
 
235
196
  offset = round(start, 4)
236
197
  # Extract audio slice
237
- audio_slice_ndarray = media_audio.ndarray[
238
- :, int(start * media_audio.sampling_rate) : int(end * media_audio.sampling_rate)
239
- ]
240
- emission = self.aligner.emission(audio_slice_ndarray)
198
+ audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
199
+ emission = self.aligner.emission(audio_slice)
241
200
 
242
201
  # Align segment
243
202
  _supervisions, _alignments = self.aligner.alignment(
244
203
  media_audio,
245
204
  _supervisions,
246
205
  split_sentence=split_sentence or self.caption_config.split_sentence,
247
- return_details=self.caption_config.word_level
248
- or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
206
+ return_details=True,
249
207
  emission=emission,
250
208
  offset=offset,
251
209
  verbose=False,
@@ -253,14 +211,16 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
253
211
 
254
212
  supervisions.extend(_supervisions)
255
213
  alignments.extend(_alignments)
214
+
215
+ # sort by start
216
+ alignments = sorted(alignments, key=lambda x: x.start)
256
217
  else:
257
218
  # Step 2-4: Standard single-pass alignment
258
219
  supervisions, alignments = self.aligner.alignment(
259
220
  media_audio,
260
221
  caption.supervisions,
261
222
  split_sentence=split_sentence or self.caption_config.split_sentence,
262
- return_details=self.caption_config.word_level
263
- or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
223
+ return_details=True,
264
224
  )
265
225
 
266
226
  # Update caption with aligned results
@@ -358,6 +318,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
358
318
  use_transcription: bool = False,
359
319
  channel_selector: Optional[str | int] = "average",
360
320
  streaming_chunk_secs: Optional[float] = None,
321
+ audio_track_id: Optional[str] = "original",
322
+ quality: str = "best",
361
323
  ) -> Caption:
362
324
  # Prepare output directory and media format
363
325
  output_dir = self._prepare_youtube_output_dir(output_dir)
@@ -366,9 +328,11 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
366
328
  safe_print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))
367
329
 
368
330
  # Step 1: Download media
369
- media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
331
+ media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite, audio_track_id, quality)
370
332
 
371
- media_audio = self.audio_loader(media_file, channel_selector=channel_selector)
333
+ media_audio = self.audio_loader(
334
+ media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
335
+ )
372
336
 
373
337
  # Step 2: Get or create captions (download or transcribe)
374
338
  caption = self._download_or_transcribe_caption(
@@ -393,7 +357,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
393
357
  output_caption_path=output_caption_path,
394
358
  split_sentence=split_sentence,
395
359
  channel_selector=channel_selector,
396
- streaming_chunk_secs=streaming_chunk_secs,
360
+ streaming_chunk_secs=None,
397
361
  )
398
362
 
399
363
  return caption
@@ -1,7 +1,13 @@
1
1
  """Configuration system for LattifAI using nemo_run."""
2
2
 
3
3
  from .alignment import AlignmentConfig
4
- from .caption import CaptionConfig
4
+ from .caption import (
5
+ CaptionConfig,
6
+ CaptionFonts,
7
+ CaptionStyle,
8
+ KaraokeConfig,
9
+ StandardizationConfig,
10
+ )
5
11
  from .client import ClientConfig
6
12
  from .diarization import DiarizationConfig
7
13
  from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
@@ -11,6 +17,10 @@ __all__ = [
11
17
  "ClientConfig",
12
18
  "AlignmentConfig",
13
19
  "CaptionConfig",
20
+ "CaptionFonts",
21
+ "CaptionStyle",
22
+ "KaraokeConfig",
23
+ "StandardizationConfig",
14
24
  "TranscriptionConfig",
15
25
  "DiarizationConfig",
16
26
  "MediaConfig",
@@ -28,11 +28,11 @@ class AlignmentConfig:
28
28
  """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
29
29
 
30
30
  batch_size: int = 1
31
- """Batch size for inference (number of samples processed simultaneously)."""
31
+ """Batch size for inference (number of samples processed simultaneously; not implemented yet)."""
32
32
 
33
33
  # Segmented Alignment for Long Audio
34
34
  trust_caption_timestamps: bool = False
35
- """When True, use original caption timestamps as strong reference constraints during alignment.
35
+ """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
36
36
  The alignment process will still adjust timestamps but stay close to the input timing.
37
37
  Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
38
38
  while preserving the approximate timing from the original captions.
@@ -93,6 +93,13 @@ class AlignmentConfig:
93
93
  Default: 0.20. Typical range: 0.0-0.5.
94
94
  """
95
95
 
96
+ boost: float = 5.0
97
+ """Boost for preferring supervisions over transcription in diff alignment decoding graph.
98
+ A positive value encourages the decoder to prefer supervision text over ASR transcription.
99
+ Only effective when strategy='transcription'. Has no effect with 'entire' or 'caption' strategies.
100
+ Default: 5.0. Typical range: 0.0-10.0.
101
+ """
102
+
96
103
  client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
97
104
  """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
98
105
 
@@ -1,29 +1,249 @@
1
1
  """Caption I/O configuration for LattifAI."""
2
2
 
3
- from dataclasses import dataclass
3
+ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Literal, Optional
5
+ from typing import TYPE_CHECKING, Dict, Literal, Optional, get_args
6
6
 
7
7
  from lhotse.utils import Pathlike
8
8
 
9
- # Supported caption formats for reading/writing
10
- CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "md", "ttml", "sami", "smi"]
9
+ # =============================================================================
10
+ # Caption Style Configuration Classes
11
+ # =============================================================================
11
12
 
12
- # Input caption formats (includes special formats like 'auto' and 'gemini')
13
- INPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "auto", "gemini"]
14
13
 
15
- # Output caption formats (includes special formats like 'TextGrid' and 'json')
16
- OUTPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"]
14
+ class CaptionFonts:
15
+ """Common caption font constants.
16
+
17
+ These are reference constants for popular fonts. You can use any
18
+ system font name as the font_name parameter in CaptionStyle.
19
+ """
20
+
21
+ # Western fonts
22
+ ARIAL = "Arial"
23
+ IMPACT = "Impact"
24
+ VERDANA = "Verdana"
25
+ HELVETICA = "Helvetica"
26
+
27
+ # Chinese fonts
28
+ NOTO_SANS_SC = "Noto Sans SC"
29
+ MICROSOFT_YAHEI = "Microsoft YaHei"
30
+ PINGFANG_SC = "PingFang SC"
31
+ SIMHEI = "SimHei"
32
+
33
+ # Japanese fonts
34
+ NOTO_SANS_JP = "Noto Sans JP"
35
+ MEIRYO = "Meiryo"
36
+ HIRAGINO_SANS = "Hiragino Sans"
37
+
38
+ # Korean fonts
39
+ NOTO_SANS_KR = "Noto Sans KR"
40
+ MALGUN_GOTHIC = "Malgun Gothic"
41
+
42
+
43
+ @dataclass
44
+ class CaptionStyle:
45
+ """Caption style configuration for ASS/TTML formats.
46
+
47
+ Attributes:
48
+ primary_color: Main text color (#RRGGBB)
49
+ secondary_color: Secondary/highlight color (#RRGGBB)
50
+ outline_color: Text outline color (#RRGGBB)
51
+ back_color: Shadow color (#RRGGBB)
52
+ font_name: Font family name (use CaptionFonts constants or any system font)
53
+ font_size: Font size in points
54
+ bold: Enable bold text
55
+ italic: Enable italic text
56
+ outline_width: Outline thickness
57
+ shadow_depth: Shadow distance
58
+ alignment: ASS alignment (1-9, numpad style), 2=bottom-center
59
+ margin_l: Left margin in pixels
60
+ margin_r: Right margin in pixels
61
+ margin_v: Vertical margin in pixels
62
+ """
63
+
64
+ # Colors (#RRGGBB format)
65
+ primary_color: str = "#FFFFFF"
66
+ secondary_color: str = "#00FFFF"
67
+ outline_color: str = "#000000"
68
+ back_color: str = "#000000"
69
+
70
+ # Font
71
+ font_name: str = CaptionFonts.ARIAL
72
+ font_size: int = 48
73
+ bold: bool = False
74
+ italic: bool = False
75
+
76
+ # Border and shadow
77
+ outline_width: float = 2.0
78
+ shadow_depth: float = 1.0
79
+
80
+ # Position
81
+ alignment: int = 2
82
+ margin_l: int = 20
83
+ margin_r: int = 20
84
+ margin_v: int = 20
85
+
86
+
87
+ @dataclass
88
+ class KaraokeConfig:
89
+ """Karaoke export configuration.
90
+
91
+ Attributes:
92
+ enabled: Whether karaoke mode is enabled
93
+ effect: Karaoke effect type
94
+ - "sweep": Gradual fill from left to right (ASS \\kf tag)
95
+ - "instant": Instant highlight (ASS \\k tag)
96
+ - "outline": Outline then fill (ASS \\ko tag)
97
+ style: Caption style configuration (font, colors, position)
98
+ lrc_precision: LRC time precision ("centisecond" or "millisecond")
99
+ lrc_metadata: LRC metadata dict (ar, ti, al, etc.)
100
+ ttml_timing_mode: TTML timing attribute ("Word" or "Line")
101
+ """
102
+
103
+ enabled: bool = False
104
+ effect: Literal["sweep", "instant", "outline"] = "sweep"
105
+ style: CaptionStyle = field(default_factory=CaptionStyle)
106
+
107
+ # LRC specific
108
+ lrc_precision: Literal["centisecond", "millisecond"] = "millisecond"
109
+ lrc_metadata: Dict[str, str] = field(default_factory=dict)
110
+
111
+ # TTML specific
112
+ ttml_timing_mode: Literal["Word", "Line"] = "Word"
17
113
 
18
- # All caption formats combined (for file detection)
19
- ALL_CAPTION_FORMATS = list(set(CAPTION_FORMATS + ["TextGrid", "json", "gemini"]))
20
114
 
21
- # Type aliases for better type hints
22
- InputCaptionFormat = Literal["auto", "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "gemini"]
115
+ @dataclass
116
+ class StandardizationConfig:
117
+ """Caption standardization configuration following broadcast guidelines.
118
+
119
+ Reference Standards:
120
+ - Netflix Timed Text Style Guide
121
+ - BBC Subtitle Guidelines
122
+ - EBU-TT-D Standard
123
+
124
+ Attributes:
125
+ min_duration: Minimum segment duration (seconds). Netflix recommends 5/6s, BBC 0.3s
126
+ max_duration: Maximum segment duration (seconds). Netflix/BBC recommends 7s
127
+ min_gap: Minimum gap between segments (seconds). 80ms prevents subtitle flicker
128
+ max_lines: Maximum lines per segment. Broadcast standard is typically 2
129
+ max_chars_per_line: Maximum characters per line. CJK auto-adjusted by ÷2 (e.g., 42 → 21)
130
+ optimal_cps: Optimal reading speed (chars/sec). Netflix recommends 17-20 CPS
131
+ start_margin: Start margin (seconds) before first word. None = no adjustment (default)
132
+ end_margin: End margin (seconds) after last word. None = no adjustment (default)
133
+ margin_collision_mode: How to handle collisions: 'trim' (reduce margin) or 'gap' (maintain min_gap)
134
+ """
135
+
136
+ min_duration: float = 0.8
137
+ max_duration: float = 7.0
138
+ min_gap: float = 0.08
139
+ max_lines: int = 2
140
+ max_chars_per_line: int = 42
141
+ optimal_cps: float = 17.0
142
+ start_margin: Optional[float] = None
143
+ end_margin: Optional[float] = None
144
+ margin_collision_mode: Literal["trim", "gap"] = "trim"
145
+
146
+ def __post_init__(self):
147
+ """Validate configuration parameters."""
148
+ if self.min_duration <= 0:
149
+ raise ValueError("min_duration must be positive")
150
+ if self.max_duration <= self.min_duration:
151
+ raise ValueError("max_duration must be greater than min_duration")
152
+ if self.min_gap < 0:
153
+ raise ValueError("min_gap cannot be negative")
154
+ if self.max_lines < 1:
155
+ raise ValueError("max_lines must be at least 1")
156
+ if self.max_chars_per_line < 10:
157
+ raise ValueError("max_chars_per_line must be at least 10")
158
+ if self.start_margin is not None and self.start_margin < 0:
159
+ raise ValueError("start_margin cannot be negative")
160
+ if self.end_margin is not None and self.end_margin < 0:
161
+ raise ValueError("end_margin cannot be negative")
162
+ if self.margin_collision_mode not in ("trim", "gap"):
163
+ raise ValueError("margin_collision_mode must be 'trim' or 'gap'")
164
+
165
+
166
+ # =============================================================================
167
+ # Format Type Definitions (Single Source of Truth)
168
+ # =============================================================================
169
+
170
+ # Type alias for input caption formats (all formats with registered readers)
171
+ InputCaptionFormat = Literal[
172
+ # Standard subtitle formats
173
+ "srt",
174
+ "vtt", # WebVTT (auto-detects YouTube VTT with word-level timestamps)
175
+ "ass",
176
+ "ssa",
177
+ "sub",
178
+ "sbv",
179
+ "txt",
180
+ "sami",
181
+ "smi",
182
+ # Tabular formats
183
+ "csv",
184
+ "tsv",
185
+ "aud",
186
+ "json",
187
+ # Specialized formats
188
+ "textgrid", # Praat TextGrid
189
+ "gemini", # Gemini/YouTube transcript format
190
+ # Professional NLE formats
191
+ "avid_ds",
192
+ "fcpxml",
193
+ "premiere_xml",
194
+ "audition_csv",
195
+ # Special
196
+ "auto", # Auto-detect format
197
+ ]
198
+
199
+ # Type alias for output caption formats (all formats with registered writers)
23
200
  OutputCaptionFormat = Literal[
24
- "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"
201
+ # Standard subtitle formats
202
+ "srt",
203
+ "vtt", # WebVTT (use karaoke_config.enabled=True for YouTube VTT style output)
204
+ "ass",
205
+ "ssa",
206
+ "sub",
207
+ "sbv",
208
+ "txt",
209
+ "sami",
210
+ "smi",
211
+ # Tabular formats
212
+ "csv",
213
+ "tsv",
214
+ "aud",
215
+ "json",
216
+ # Specialized formats
217
+ "textgrid", # Praat TextGrid
218
+ "gemini", # Gemini/YouTube transcript format
219
+ # TTML profiles (write-only)
220
+ "ttml", # Generic TTML
221
+ "imsc1", # IMSC1 (Netflix/streaming) TTML profile
222
+ "ebu_tt_d", # EBU-TT-D (European broadcast) TTML profile
223
+ # Professional NLE formats
224
+ "avid_ds", # Avid Media Composer SubCap format
225
+ "fcpxml", # Final Cut Pro XML
226
+ "premiere_xml", # Adobe Premiere Pro XML (graphic clips)
227
+ "audition_csv", # Adobe Audition markers
228
+ "edimarker_csv", # Pro Tools (via EdiMarker) markers
25
229
  ]
26
230
 
231
+ # =============================================================================
232
+ # Runtime Format Lists (Derived from Type Definitions)
233
+ # =============================================================================
234
+
235
+ # Input caption formats list (derived from InputCaptionFormat)
236
+ INPUT_CAPTION_FORMATS: list[str] = list(get_args(InputCaptionFormat))
237
+
238
+ # Output caption formats list (derived from OutputCaptionFormat)
239
+ OUTPUT_CAPTION_FORMATS: list[str] = list(get_args(OutputCaptionFormat))
240
+
241
+ # Standard caption formats (formats with both reader and writer)
242
+ CAPTION_FORMATS: list[str] = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "sami", "smi"]
243
+
244
+ # All caption formats combined (for file detection, excludes "auto")
245
+ ALL_CAPTION_FORMATS: list[str] = list(set(INPUT_CAPTION_FORMATS + OUTPUT_CAPTION_FORMATS) - {"auto"})
246
+
27
247
 
28
248
  @dataclass
29
249
  class CaptionConfig:
@@ -34,13 +254,20 @@ class CaptionConfig:
34
254
  """
35
255
 
36
256
  input_format: InputCaptionFormat = "auto"
37
- """Input caption format: 'auto', 'srt', 'vtt', 'ass', 'txt', or 'json'."""
257
+ """Input caption format. Supports: 'auto' (detect),
258
+ standard formats (srt, vtt, ass, ssa, sub, sbv, txt, sami, smi),
259
+ tabular (csv, tsv, aud, json),
260
+ specialized (textgrid, gemini),
261
+ NLE (avid_ds, fcpxml, premiere_xml, audition_csv).
262
+ Note: VTT format auto-detects YouTube VTT with word-level timestamps.
263
+ """
38
264
 
39
265
  input_path: Optional[str] = None
40
266
  """Path to input caption file."""
41
267
 
42
268
  output_format: OutputCaptionFormat = "srt"
43
- """Output caption format: 'srt', 'vtt', 'ass', 'txt', or 'json'."""
269
+ """Output caption format. Supports: standard formats, tabular, specialized, TTML profiles (ttml, imsc1, ebu_tt_d),
270
+ NLE (avid_ds, fcpxml, premiere_xml, audition_csv, edimarker_csv)."""
44
271
 
45
272
  output_path: Optional[str] = None
46
273
  """Path to output caption file."""
@@ -57,12 +284,21 @@ class CaptionConfig:
57
284
  word_level: bool = False
58
285
  """Include word-level timestamps in alignment results (useful for karaoke, dubbing)."""
59
286
 
287
+ karaoke: Optional[KaraokeConfig] = None
288
+ """Karaoke configuration when word_level=True (e.g., ASS \\kf tags, enhanced LRC).
289
+ When None with word_level=True, outputs word-per-segment instead of karaoke styling.
290
+ When provided, karaoke.enabled controls whether karaoke styling is applied."""
291
+
60
292
  encoding: str = "utf-8"
61
293
  """Character encoding for reading/writing caption files (default: utf-8)."""
62
294
 
63
295
  source_lang: Optional[str] = None
64
296
  """Source language code for the caption content (e.g., 'en', 'zh', 'de')."""
65
297
 
298
+ standardization: Optional[StandardizationConfig] = None
299
+ """Standardization configuration for broadcast-grade captions.
300
+ When provided, captions will be standardized according to Netflix/BBC guidelines."""
301
+
66
302
  def __post_init__(self):
67
303
  """Validate configuration after initialization."""
68
304
  self._normalize_paths()
@@ -86,14 +322,17 @@ class CaptionConfig:
86
322
  return True
87
323
 
88
324
  def _normalize_paths(self) -> None:
89
- """Normalize and expand input/output paths."""
325
+ """Normalize and expand input/output paths.
326
+
327
+ Uses Path.resolve() to produce absolute, normalized paths (symlinks and ".." components are resolved).
328
+ """
90
329
  # Expand and normalize input path if provided, but don't require it to exist yet
91
330
  # (it might be set later after downloading captions)
92
331
  if self.input_path is not None:
93
- self.input_path = str(Path(self.input_path).expanduser())
332
+ self.input_path = str(Path(self.input_path).expanduser().resolve())
94
333
 
95
334
  if self.output_path is not None:
96
- self.output_path = str(Path(self.output_path).expanduser())
335
+ self.output_path = str(Path(self.output_path).expanduser().resolve())
97
336
  output_dir = Path(self.output_path).parent
98
337
  output_dir.mkdir(parents=True, exist_ok=True)
99
338
 
@@ -154,7 +393,7 @@ class CaptionConfig:
154
393
  if not self.input_path:
155
394
  raise ValueError("input_path is required but not set in CaptionConfig")
156
395
 
157
- input_file = Path(self.input_path).expanduser()
396
+ input_file = Path(self.input_path).expanduser().resolve()
158
397
  if not input_file.exists():
159
398
  raise FileNotFoundError(
160
399
  f"Input caption file does not exist: '{input_file}'. " "Please check the path and try again."
@@ -164,15 +403,20 @@ class CaptionConfig:
164
403
  f"Input caption path is not a file: '{input_file}'. " "Expected a valid caption file path."
165
404
  )
166
405
 
167
- def check_sanity(self) -> bool:
168
- """Perform sanity checks on the configuration."""
169
- assert self.is_input_path_existed(), "Input caption path must be provided and exist."
406
+ def check_sanity(self) -> None:
407
+ """Perform sanity checks on the configuration.
408
+
409
+ Raises:
410
+ ValueError: If input path is not provided or does not exist.
411
+ """
412
+ if not self.is_input_path_existed():
413
+ raise ValueError("Input caption path must be provided and exist.")
170
414
 
171
415
  def is_input_path_existed(self) -> bool:
172
416
  """Check if input caption path is provided and exists."""
173
417
  if self.input_path is None:
174
418
  return False
175
419
 
176
- input_file = Path(self.input_path).expanduser()
420
+ input_file = Path(self.input_path).expanduser().resolve()
177
421
  self.input_path = str(input_file)
178
422
  return input_file.exists() and input_file.is_file()
lattifai/config/media.py CHANGED
@@ -91,6 +91,26 @@ class MediaConfig:
91
91
  force_overwrite: bool = False
92
92
  """Overwrite existing output files without prompting."""
93
93
 
94
+ audio_track_id: Optional[str] = "original"
95
+ """Audio track ID for multi-language YouTube videos.
96
+ - "original": Select the original audio track (default)
97
+ - Language code (e.g., "en", "ja", "fr"): Select by language
98
+ - Format ID (e.g., "251-drc", "140-0"): Select specific format
99
+ - None: No filtering, use yt-dlp default selection
100
+ """
101
+
102
+ quality: str = "best"
103
+ """Media quality for YouTube downloads.
104
+ For audio:
105
+ - "best": Highest bitrate (default)
106
+ - "medium": ~128 kbps
107
+ - "low": ~50 kbps
108
+ - Numeric string (e.g., "128"): Target bitrate in kbps
109
+ For video:
110
+ - "best": Highest resolution (default)
111
+ - "1080", "720", "480", "360": Target resolution
112
+ """
113
+
94
114
  def __post_init__(self) -> None:
95
115
  """Validate configuration and normalize paths/formats."""
96
116
  self._setup_output_directory()