lattifai-1.2.0-py3-none-any.whl → lattifai-1.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai/client.py
CHANGED
@@ -7,7 +7,7 @@ import colorful
 from lattifai_core.client import SyncAPIClient
 from lhotse.utils import Pathlike

-from lattifai.alignment import Lattice1Aligner, Segmenter
+from lattifai.alignment import Lattice1Aligner, Segmenter, align_supervisions_and_transcription
 from lattifai.audio2 import AudioData, AudioLoader
 from lattifai.caption import Caption, InputCaptionFormat
 from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
@@ -123,86 +123,46 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         alignment_strategy = self.aligner.config.strategy

         if alignment_strategy != "entire" or caption.transcription:
-            safe_print(colorful.cyan(f"🔄
+            safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))

             if caption.supervisions and alignment_strategy == "transcription":
-
-
-
-
-
-                    caption.supervisions
-                ), "Input caption should contain supervisions when using transcription-based alignment."
+                if "gemini" in self.transcriber.name.lower():
+                    raise ValueError(
+                        f"Transcription-based alignment is not supported for {self.transcriber.name} "
+                        "(Gemini's timestamp is not reliable)."
+                    )
                 if not caption.transcription:
-
-
-
-
-
-                        Path(str(output_caption_path)).parent
-                        / f"{Path(str(media_audio)).stem}_{self.transcriber.file_name}"
-                    )
-                    if transcript_file.exists():
-                        # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
-                        transcript = self._read_caption(transcript_file, verbose=False)
-                        caption.transcription = transcript.supervisions
-                        caption.audio_events = transcript.audio_events
-
-                    if not caption.transcription:
-                        transcript = asyncio.run(
-                            self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
-                        )
-                        caption.transcription = transcript.transcription
-                        caption.audio_events = transcript.audio_events
-
-                # Align caption.supervisions with transcription to get segments
-                import regex
-                from error_align import ErrorAlign, error_align  # noqa: F401
-                from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
-
-                JOIN_TOKEN = "❄"
-                if JOIN_TOKEN not in DELIMITERS:
-                    DELIMITERS.add(JOIN_TOKEN)
-
-                def custom_tokenizer(text: str) -> list:
-                    """Default tokenizer that splits text into words based on whitespace.
-
-                    Args:
-                        text (str): The input text to tokenize.
-
-                    Returns:
-                        list: A list of tokens (words).
-
-                    """
-                    # Escape JOIN_TOKEN for use in regex pattern
-                    escaped_join_token = regex.escape(JOIN_TOKEN)
-                    return list(
-                        regex.finditer(
-                            rf"({NUMERIC_TOKEN})|({STANDARD_TOKEN}|{escaped_join_token})",
-                            text,
-                            regex.UNICODE | regex.VERBOSE,
-                        )
+                    transcript = self._transcribe(
+                        media_audio,
+                        source_lang=self.caption_config.source_lang,
+                        is_async=False,
+                        output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
                     )
+                    caption.transcription = transcript.supervisions or transcript.transcription
+                    caption.audio_events = transcript.audio_events
+                    if not caption.transcription:
+                        raise ValueError("Transcription is empty after transcription step.")

-
-
-                    f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
-                    tokenizer=custom_tokenizer,
-                )
-
-                for align in alignments:
-                    if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
-                        pass
+                if split_sentence or self.caption_config.split_sentence:
+                    caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)

-
-
-
-                        # # print(colorful.yellow(f"⚠️ Alignment warning: {op}"))
-                        # pass
+                matches = align_supervisions_and_transcription(
+                    caption, max_duration=media_audio.duration, verbose=True
+                )

-
+                skipalign = False
+                matches = sorted(matches, key=lambda x: x[2].WER.WER)  # sort by WER
+                segments = [(m[3].start[1], m[3].end[1], m, skipalign) for m in matches]
+                for segment in segments:
+                    # transcription segments -> sentence splitting
+                    segment[2][1] = self.aligner.tokenizer.split_sentences(segment[2][1])
             else:
                 if caption.transcription:
+                    if "gemini" in self.transcriber.name.lower():
+                        raise ValueError(
+                            f"Transcription-based alignment is not supported for {self.transcriber.name} "
+                            "(Gemini's timestamp is not reliable)."
+                        )
                     if not caption.supervisions:  # youtube + transcription case
                         segments = [(sup.start, sup.end, [sup], not sup.text) for sup in caption.transcription]
                     else:
@@ -220,9 +180,10 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 )

             # align each segment
+            sr = media_audio.sampling_rate
             supervisions, alignments = [], []
             for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
-
+                safe_print(
                     colorful.green(
                         f"  ⏩ aligning segment {i:04d}/{len(segments):04d}: {start:8.2f}s - {end:8.2f}s"
                     )
@@ -234,18 +195,15 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

                 offset = round(start, 4)
                 # Extract audio slice
-
-
-                ]
-                emission = self.aligner.emission(audio_slice_ndarray)
+                audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+                emission = self.aligner.emission(audio_slice)

                 # Align segment
                 _supervisions, _alignments = self.aligner.alignment(
                     media_audio,
                     _supervisions,
                     split_sentence=split_sentence or self.caption_config.split_sentence,
-                    return_details=
-                    or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
+                    return_details=True,
                     emission=emission,
                     offset=offset,
                     verbose=False,
@@ -253,14 +211,16 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

                 supervisions.extend(_supervisions)
                 alignments.extend(_alignments)
+
+            # sort by start
+            alignments = sorted(alignments, key=lambda x: x.start)
         else:
             # Step 2-4: Standard single-pass alignment
             supervisions, alignments = self.aligner.alignment(
                 media_audio,
                 caption.supervisions,
                 split_sentence=split_sentence or self.caption_config.split_sentence,
-                return_details=
-                or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
+                return_details=True,
             )

         # Update caption with aligned results
@@ -358,6 +318,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         use_transcription: bool = False,
         channel_selector: Optional[str | int] = "average",
         streaming_chunk_secs: Optional[float] = None,
+        audio_track_id: Optional[str] = "original",
+        quality: str = "best",
     ) -> Caption:
         # Prepare output directory and media format
         output_dir = self._prepare_youtube_output_dir(output_dir)
@@ -366,9 +328,11 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         safe_print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))

         # Step 1: Download media
-        media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
+        media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite, audio_track_id, quality)

-        media_audio = self.audio_loader(
+        media_audio = self.audio_loader(
+            media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+        )

         # Step 2: Get or create captions (download or transcribe)
         caption = self._download_or_transcribe_caption(
@@ -393,7 +357,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             output_caption_path=output_caption_path,
             split_sentence=split_sentence,
             channel_selector=channel_selector,
-            streaming_chunk_secs=
+            streaming_chunk_secs=None,
        )

        return caption
lattifai/config/__init__.py
CHANGED
@@ -1,7 +1,13 @@
 """Configuration system for LattifAI using nemo_run."""

 from .alignment import AlignmentConfig
-from .caption import
+from .caption import (
+    CaptionConfig,
+    CaptionFonts,
+    CaptionStyle,
+    KaraokeConfig,
+    StandardizationConfig,
+)
 from .client import ClientConfig
 from .diarization import DiarizationConfig
 from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
@@ -11,6 +17,10 @@ __all__ = [
     "ClientConfig",
     "AlignmentConfig",
     "CaptionConfig",
+    "CaptionFonts",
+    "CaptionStyle",
+    "KaraokeConfig",
+    "StandardizationConfig",
     "TranscriptionConfig",
     "DiarizationConfig",
     "MediaConfig",
lattifai/config/alignment.py
CHANGED
@@ -28,11 +28,11 @@ class AlignmentConfig:
     """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""

     batch_size: int = 1
-    """Batch size for inference (number of samples processed simultaneously)."""
+    """Batch size for inference (number of samples processed simultaneously, NotImplemented yet)."""

     # Segmented Alignment for Long Audio
     trust_caption_timestamps: bool = False
-    """When True, use original caption timestamps as strong reference constraints during alignment.
+    """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
     The alignment process will still adjust timestamps but stay close to the input timing.
     Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
     while preserving the approximate timing from the original captions.
@@ -93,6 +93,13 @@ class AlignmentConfig:
     Default: 0.20. Typical range: 0.0-0.5.
     """

+    boost: float = 5.0
+    """Boost for preferring supervisions over transcription in diff alignment decoding graph.
+    A positive value encourages the decoder to prefer supervision text over ASR transcription.
+    Only effective when strategy='transcription'. Has no effect with 'entire' or 'caption' strategies.
+    Default: 5.0. Typical range: 0.0-10.0.
+    """
+
     client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
     """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

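For illustration, a config exercising the new boost knob might look like the sketch below. The field names come from this diff; per the docstring, boost only takes effect under the 'transcription' strategy (the strategy setting itself is referenced via self.aligner.config.strategy in client.py but is not shown in this hunk, so it is left at its default here):

    from lattifai.config import AlignmentConfig

    config = AlignmentConfig(
        device="cpu",                    # 'cpu', 'cuda', or 'mps' per the docstring above
        trust_caption_timestamps=False,  # do not pin output timing to the input captions
        boost=5.0,                       # prefer supervision text over ASR transcription
    )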
lattifai/config/caption.py
CHANGED
@@ -1,29 +1,249 @@
 """Caption I/O configuration for LattifAI."""

-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Literal, Optional
+from typing import TYPE_CHECKING, Dict, Literal, Optional, get_args

 from lhotse.utils import Pathlike

-#
-
+# =============================================================================
+# Caption Style Configuration Classes
+# =============================================================================

-# Input caption formats (includes special formats like 'auto' and 'gemini')
-INPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "auto", "gemini"]

-
-
+class CaptionFonts:
+    """Common caption font constants.
+
+    These are reference constants for popular fonts. You can use any
+    system font name as the font_name parameter in CaptionStyle.
+    """
+
+    # Western fonts
+    ARIAL = "Arial"
+    IMPACT = "Impact"
+    VERDANA = "Verdana"
+    HELVETICA = "Helvetica"
+
+    # Chinese fonts
+    NOTO_SANS_SC = "Noto Sans SC"
+    MICROSOFT_YAHEI = "Microsoft YaHei"
+    PINGFANG_SC = "PingFang SC"
+    SIMHEI = "SimHei"
+
+    # Japanese fonts
+    NOTO_SANS_JP = "Noto Sans JP"
+    MEIRYO = "Meiryo"
+    HIRAGINO_SANS = "Hiragino Sans"
+
+    # Korean fonts
+    NOTO_SANS_KR = "Noto Sans KR"
+    MALGUN_GOTHIC = "Malgun Gothic"
+
+
+@dataclass
+class CaptionStyle:
+    """Caption style configuration for ASS/TTML formats.
+
+    Attributes:
+        primary_color: Main text color (#RRGGBB)
+        secondary_color: Secondary/highlight color (#RRGGBB)
+        outline_color: Text outline color (#RRGGBB)
+        back_color: Shadow color (#RRGGBB)
+        font_name: Font family name (use CaptionFonts constants or any system font)
+        font_size: Font size in points
+        bold: Enable bold text
+        italic: Enable italic text
+        outline_width: Outline thickness
+        shadow_depth: Shadow distance
+        alignment: ASS alignment (1-9, numpad style), 2=bottom-center
+        margin_l: Left margin in pixels
+        margin_r: Right margin in pixels
+        margin_v: Vertical margin in pixels
+    """
+
+    # Colors (#RRGGBB format)
+    primary_color: str = "#FFFFFF"
+    secondary_color: str = "#00FFFF"
+    outline_color: str = "#000000"
+    back_color: str = "#000000"
+
+    # Font
+    font_name: str = CaptionFonts.ARIAL
+    font_size: int = 48
+    bold: bool = False
+    italic: bool = False
+
+    # Border and shadow
+    outline_width: float = 2.0
+    shadow_depth: float = 1.0
+
+    # Position
+    alignment: int = 2
+    margin_l: int = 20
+    margin_r: int = 20
+    margin_v: int = 20
+
+
+@dataclass
+class KaraokeConfig:
+    """Karaoke export configuration.
+
+    Attributes:
+        enabled: Whether karaoke mode is enabled
+        effect: Karaoke effect type
+            - "sweep": Gradual fill from left to right (ASS \\kf tag)
+            - "instant": Instant highlight (ASS \\k tag)
+            - "outline": Outline then fill (ASS \\ko tag)
+        style: Caption style configuration (font, colors, position)
+        lrc_precision: LRC time precision ("centisecond" or "millisecond")
+        lrc_metadata: LRC metadata dict (ar, ti, al, etc.)
+        ttml_timing_mode: TTML timing attribute ("Word" or "Line")
+    """
+
+    enabled: bool = False
+    effect: Literal["sweep", "instant", "outline"] = "sweep"
+    style: CaptionStyle = field(default_factory=CaptionStyle)
+
+    # LRC specific
+    lrc_precision: Literal["centisecond", "millisecond"] = "millisecond"
+    lrc_metadata: Dict[str, str] = field(default_factory=dict)
+
+    # TTML specific
+    ttml_timing_mode: Literal["Word", "Line"] = "Word"

-# All caption formats combined (for file detection)
-ALL_CAPTION_FORMATS = list(set(CAPTION_FORMATS + ["TextGrid", "json", "gemini"]))

-
-
+@dataclass
+class StandardizationConfig:
+    """Caption standardization configuration following broadcast guidelines.
+
+    Reference Standards:
+    - Netflix Timed Text Style Guide
+    - BBC Subtitle Guidelines
+    - EBU-TT-D Standard
+
+    Attributes:
+        min_duration: Minimum segment duration (seconds). Netflix recommends 5/6s, BBC 0.3s
+        max_duration: Maximum segment duration (seconds). Netflix/BBC recommends 7s
+        min_gap: Minimum gap between segments (seconds). 80ms prevents subtitle flicker
+        max_lines: Maximum lines per segment. Broadcast standard is typically 2
+        max_chars_per_line: Maximum characters per line. CJK auto-adjusted by ÷2 (e.g., 42 → 21)
+        optimal_cps: Optimal reading speed (chars/sec). Netflix recommends 17-20 CPS
+        start_margin: Start margin (seconds) before first word. None = no adjustment (default)
+        end_margin: End margin (seconds) after last word. None = no adjustment (default)
+        margin_collision_mode: How to handle collisions: 'trim' (reduce margin) or 'gap' (maintain min_gap)
+    """
+
+    min_duration: float = 0.8
+    max_duration: float = 7.0
+    min_gap: float = 0.08
+    max_lines: int = 2
+    max_chars_per_line: int = 42
+    optimal_cps: float = 17.0
+    start_margin: Optional[float] = None
+    end_margin: Optional[float] = None
+    margin_collision_mode: Literal["trim", "gap"] = "trim"
+
+    def __post_init__(self):
+        """Validate configuration parameters."""
+        if self.min_duration <= 0:
+            raise ValueError("min_duration must be positive")
+        if self.max_duration <= self.min_duration:
+            raise ValueError("max_duration must be greater than min_duration")
+        if self.min_gap < 0:
+            raise ValueError("min_gap cannot be negative")
+        if self.max_lines < 1:
+            raise ValueError("max_lines must be at least 1")
+        if self.max_chars_per_line < 10:
+            raise ValueError("max_chars_per_line must be at least 10")
+        if self.start_margin is not None and self.start_margin < 0:
+            raise ValueError("start_margin cannot be negative")
+        if self.end_margin is not None and self.end_margin < 0:
+            raise ValueError("end_margin cannot be negative")
+        if self.margin_collision_mode not in ("trim", "gap"):
+            raise ValueError("margin_collision_mode must be 'trim' or 'gap'")
+
+
+# =============================================================================
+# Format Type Definitions (Single Source of Truth)
+# =============================================================================
+
+# Type alias for input caption formats (all formats with registered readers)
+InputCaptionFormat = Literal[
+    # Standard subtitle formats
+    "srt",
+    "vtt",  # WebVTT (auto-detects YouTube VTT with word-level timestamps)
+    "ass",
+    "ssa",
+    "sub",
+    "sbv",
+    "txt",
+    "sami",
+    "smi",
+    # Tabular formats
+    "csv",
+    "tsv",
+    "aud",
+    "json",
+    # Specialized formats
+    "textgrid",  # Praat TextGrid
+    "gemini",  # Gemini/YouTube transcript format
+    # Professional NLE formats
+    "avid_ds",
+    "fcpxml",
+    "premiere_xml",
+    "audition_csv",
+    # Special
+    "auto",  # Auto-detect format
+]
+
+# Type alias for output caption formats (all formats with registered writers)
 OutputCaptionFormat = Literal[
-
+    # Standard subtitle formats
+    "srt",
+    "vtt",  # WebVTT (use karaoke_config.enabled=True for YouTube VTT style output)
+    "ass",
+    "ssa",
+    "sub",
+    "sbv",
+    "txt",
+    "sami",
+    "smi",
+    # Tabular formats
+    "csv",
+    "tsv",
+    "aud",
+    "json",
+    # Specialized formats
+    "textgrid",  # Praat TextGrid
+    "gemini",  # Gemini/YouTube transcript format
+    # TTML profiles (write-only)
+    "ttml",  # Generic TTML
+    "imsc1",  # IMSC1 (Netflix/streaming) TTML profile
+    "ebu_tt_d",  # EBU-TT-D (European broadcast) TTML profile
+    # Professional NLE formats
+    "avid_ds",  # Avid Media Composer SubCap format
+    "fcpxml",  # Final Cut Pro XML
+    "premiere_xml",  # Adobe Premiere Pro XML (graphic clips)
+    "audition_csv",  # Adobe Audition markers
+    "edimarker_csv",  # Pro Tools (via EdiMarker) markers
 ]

+# =============================================================================
+# Runtime Format Lists (Derived from Type Definitions)
+# =============================================================================
+
+# Input caption formats list (derived from InputCaptionFormat)
+INPUT_CAPTION_FORMATS: list[str] = list(get_args(InputCaptionFormat))
+
+# Output caption formats list (derived from OutputCaptionFormat)
+OUTPUT_CAPTION_FORMATS: list[str] = list(get_args(OutputCaptionFormat))
+
+# Standard caption formats (formats with both reader and writer)
+CAPTION_FORMATS: list[str] = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "sami", "smi"]
+
+# All caption formats combined (for file detection, excludes "auto")
+ALL_CAPTION_FORMATS: list[str] = list(set(INPUT_CAPTION_FORMATS + OUTPUT_CAPTION_FORMATS) - {"auto"})
+

 @dataclass
 class CaptionConfig:
@@ -34,13 +254,20 @@ class CaptionConfig:
     """

     input_format: InputCaptionFormat = "auto"
-    """Input caption format: 'auto'
+    """Input caption format. Supports: 'auto' (detect),
+    standard formats (srt, vtt, ass, ssa, sub, sbv, txt, sami, smi),
+    tabular (csv, tsv, aud, json),
+    specialized (textgrid, gemini),
+    NLE (avid_ds, fcpxml, premiere_xml, audition_csv).
+    Note: VTT format auto-detects YouTube VTT with word-level timestamps.
+    """

     input_path: Optional[str] = None
     """Path to input caption file."""

     output_format: OutputCaptionFormat = "srt"
-    """Output caption format:
+    """Output caption format. Supports: standard formats, tabular, specialized, TTML profiles (ttml, imsc1, ebu_tt_d),
+    NLE (avid_ds, fcpxml, premiere_xml, audition_csv, edimarker_csv)."""

     output_path: Optional[str] = None
     """Path to output caption file."""
@@ -57,12 +284,21 @@ class CaptionConfig:
     word_level: bool = False
     """Include word-level timestamps in alignment results (useful for karaoke, dubbing)."""

+    karaoke: Optional[KaraokeConfig] = None
+    """Karaoke configuration when word_level=True (e.g., ASS \\kf tags, enhanced LRC).
+    When None with word_level=True, outputs word-per-segment instead of karaoke styling.
+    When provided, karaoke.enabled controls whether karaoke styling is applied."""
+
     encoding: str = "utf-8"
     """Character encoding for reading/writing caption files (default: utf-8)."""

     source_lang: Optional[str] = None
     """Source language code for the caption content (e.g., 'en', 'zh', 'de')."""

+    standardization: Optional[StandardizationConfig] = None
+    """Standardization configuration for broadcast-grade captions.
+    When provided, captions will be standardized according to Netflix/BBC guidelines."""
+
     def __post_init__(self):
         """Validate configuration after initialization."""
         self._normalize_paths()
@@ -86,14 +322,17 @@ class CaptionConfig:
         return True

     def _normalize_paths(self) -> None:
-        """Normalize and expand input/output paths.
+        """Normalize and expand input/output paths.
+
+        Uses Path.resolve() to get absolute paths and prevent path traversal issues.
+        """
         # Expand and normalize input path if provided, but don't require it to exist yet
         # (it might be set later after downloading captions)
         if self.input_path is not None:
-            self.input_path = str(Path(self.input_path).expanduser())
+            self.input_path = str(Path(self.input_path).expanduser().resolve())

         if self.output_path is not None:
-            self.output_path = str(Path(self.output_path).expanduser())
+            self.output_path = str(Path(self.output_path).expanduser().resolve())
             output_dir = Path(self.output_path).parent
             output_dir.mkdir(parents=True, exist_ok=True)

@@ -154,7 +393,7 @@ class CaptionConfig:
         if not self.input_path:
             raise ValueError("input_path is required but not set in CaptionConfig")

-        input_file = Path(self.input_path).expanduser()
+        input_file = Path(self.input_path).expanduser().resolve()
         if not input_file.exists():
             raise FileNotFoundError(
                 f"Input caption file does not exist: '{input_file}'. " "Please check the path and try again."
@@ -164,15 +403,20 @@ class CaptionConfig:
                 f"Input caption path is not a file: '{input_file}'. " "Expected a valid caption file path."
             )

-    def check_sanity(self) ->
-    """Perform sanity checks on the configuration.
-
+    def check_sanity(self) -> None:
+        """Perform sanity checks on the configuration.
+
+        Raises:
+            ValueError: If input path is not provided or does not exist.
+        """
+        if not self.is_input_path_existed():
+            raise ValueError("Input caption path must be provided and exist.")

     def is_input_path_existed(self) -> bool:
         """Check if input caption path is provided and exists."""
         if self.input_path is None:
             return False

-        input_file = Path(self.input_path).expanduser()
+        input_file = Path(self.input_path).expanduser().resolve()
         self.input_path = str(input_file)
         return input_file.exists() and input_file.is_file()
lattifai/config/media.py
CHANGED
@@ -91,6 +91,26 @@ class MediaConfig:
     force_overwrite: bool = False
     """Overwrite existing output files without prompting."""

+    audio_track_id: Optional[str] = "original"
+    """Audio track ID for multi-language YouTube videos.
+    - "original": Select the original audio track (default)
+    - Language code (e.g., "en", "ja", "fr"): Select by language
+    - Format ID (e.g., "251-drc", "140-0"): Select specific format
+    - None: No filtering, use yt-dlp default selection
+    """
+
+    quality: str = "best"
+    """Media quality for YouTube downloads.
+    For audio:
+    - "best": Highest bitrate (default)
+    - "medium": ~128 kbps
+    - "low": ~50 kbps
+    - Numeric string (e.g., "128"): Target bitrate in kbps
+    For video:
+    - "best": Highest resolution (default)
+    - "1080", "720", "480", "360": Target resolution
+    """
+
     def __post_init__(self) -> None:
         """Validate configuration and normalize paths/formats."""
         self._setup_output_directory()