lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/cli/youtube.py
CHANGED
@@ -11,6 +11,7 @@ from lattifai.config import (
     CaptionConfig,
     ClientConfig,
     DiarizationConfig,
+    EventConfig,
     MediaConfig,
     TranscriptionConfig,
 )
@@ -25,6 +26,7 @@ def youtube(
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
     use_transcription: bool = False,
 ):
     """
@@ -44,7 +46,8 @@ def youtube(
     Args:
         yt_url: YouTube video URL (can be provided as positional argument)
         media: Media configuration for controlling formats and output directories.
-            Fields: input_path (YouTube URL), output_dir, output_format, force_overwrite
+            Fields: input_path (YouTube URL), output_dir, output_format, force_overwrite,
+            audio_track_id (default: "original"), quality (default: "best")
         client: API client configuration.
             Fields: api_key, timeout, max_retries
         alignment: Alignment configuration (model selection and inference settings).
@@ -113,6 +116,7 @@ def youtube(
         caption_config=caption_config,
         transcription_config=transcription,
         diarization_config=diarization,
+        event_config=event,
     )

     # Call the client's youtube method
@@ -129,6 +133,8 @@ def youtube(
         channel_selector=media_config.channel_selector,
         streaming_chunk_secs=media_config.streaming_chunk_secs,
         use_transcription=use_transcription,
+        audio_track_id=media_config.audio_track_id,
+        quality=media_config.quality,
     )
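The new `event` option and the `audio_track_id`/`quality` media fields shown above can also be set programmatically. A minimal sketch, assuming only the dataclass fields visible in this diff (any other constructor arguments are assumptions):

from lattifai.config import EventConfig, MediaConfig

media = MediaConfig(
    audio_track_id="original",  # new in 1.3.0; selects which audio track to download
    quality="best",             # new in 1.3.0; download quality preset
)
event = EventConfig(enabled=True)  # opt in to audio event detection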
lattifai/client.py
CHANGED
@@ -9,8 +9,16 @@ from lhotse.utils import Pathlike

 from lattifai.alignment import Lattice1Aligner, Segmenter
 from lattifai.audio2 import AudioData, AudioLoader
-from lattifai.caption import
-from lattifai.config import
+from lattifai.caption import InputCaptionFormat
+from lattifai.config import (
+    AlignmentConfig,
+    CaptionConfig,
+    ClientConfig,
+    DiarizationConfig,
+    EventConfig,
+    TranscriptionConfig,
+)
+from lattifai.data import Caption
 from lattifai.errors import (
     AlignmentError,
     CaptionProcessingError,
@@ -22,6 +30,7 @@ from lattifai.utils import safe_print

 if TYPE_CHECKING:
     from lattifai.diarization import LattifAIDiarizer  # noqa: F401
+    from lattifai.event import LattifAIEventDetector  # noqa: F401


 class LattifAI(LattifAIClientMixin, SyncAPIClient):
@@ -41,6 +50,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         caption_config: Optional[CaptionConfig] = None,
         transcription_config: Optional[TranscriptionConfig] = None,
         diarization_config: Optional[DiarizationConfig] = None,
+        event_config: Optional[EventConfig] = None,
     ) -> None:
         __doc__ = LattifAIClientMixin._INIT_DOC.format(
             client_class="LattifAI",
@@ -59,8 +69,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         self.config = client_config

         # Initialize all configs with defaults
-        alignment_config, transcription_config, diarization_config = self._init_configs(
-            alignment_config, transcription_config, diarization_config
+        alignment_config, transcription_config, diarization_config, event_config = self._init_configs(
+            alignment_config, transcription_config, diarization_config, event_config
         )

         # Store configs
@@ -82,6 +92,14 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

         self.diarizer = LattifAIDiarizer(config=self.diarization_config)

+        # Initialize event detector if enabled
+        self.event_config = event_config
+        self.event_detector: Optional["LattifAIEventDetector"] = None
+        if self.event_config.enabled:
+            from lattifai.event import LattifAIEventDetector  # noqa: F811
+
+            self.event_detector = LattifAIEventDetector(config=self.event_config)
+
         # Initialize shared components (transcriber, downloader)
         self._init_shared_components(transcription_config)

@@ -123,9 +141,16 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         alignment_strategy = self.aligner.config.strategy

         if alignment_strategy != "entire" or caption.transcription:
-            safe_print(colorful.cyan(f"🔄
+            safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))

             if caption.supervisions and alignment_strategy == "transcription":
+                from lattifai.alignment.text_align import align_supervisions_and_transcription
+
+                if "gemini" in self.transcriber.name.lower():
+                    raise ValueError(
+                        f"Transcription-based alignment is not supported for {self.transcriber.name} "
+                        "(Gemini's timestamp is not reliable)."
+                    )
                 if not caption.transcription:
                     transcript = self._transcribe(
                         media_audio,
@@ -134,118 +159,30 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                         output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
                     )
                     caption.transcription = transcript.supervisions or transcript.transcription
-                    caption.
-
-
-                # Align caption.supervisions with transcription to get segments
-                import regex
-                from error_align import error_align  # noqa: F401
-                from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
-
-                JOIN_TOKEN = "❄"
-                if JOIN_TOKEN not in DELIMITERS:
-                    DELIMITERS.add(JOIN_TOKEN)
-
-                def custom_tokenizer(text: str) -> list:
-                    """Default tokenizer that splits text into words based on whitespace.
-
-                    Args:
-                        text (str): The input text to tokenize.
-
-                    Returns:
-                        list: A list of tokens (words).
-
-                    """
-                    # Escape JOIN_TOKEN for use in regex pattern
-                    escaped_join_token = regex.escape(JOIN_TOKEN)
-                    return list(
-                        regex.finditer(
-                            rf"({NUMERIC_TOKEN})|({STANDARD_TOKEN}|{escaped_join_token})",
-                            text,
-                            regex.UNICODE | regex.VERBOSE,
-                        )
-                    )
+                    caption.event = transcript.event
+                    if not caption.transcription:
+                        raise ValueError("Transcription is empty after transcription step.")

                 if split_sentence or self.caption_config.split_sentence:
                     caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)

-
-
-
-
-
-
-
-
-
-
-                ref_start = 0
-                for i in range(idx, k + 1):
-                    if i < len(alignments) and alignments[i].ref_slice is not None:
-                        ref_start = alignments[i].ref_slice.start
-                        break
-
-                # Find last non-None ref_slice up to current position
-                ref_stop = len(ref)
-                for i in range(k, idx - 1, -1):
-                    if i < len(alignments) and alignments[i].ref_slice is not None:
-                        ref_stop = alignments[i].ref_slice.stop
-                        break
-
-                # Find first non-None hyp_slice starting from idx
-                hyp_start = 0
-                for i in range(idx, k + 1):
-                    if i < len(alignments) and alignments[i].hyp_slice is not None:
-                        hyp_start = alignments[i].hyp_slice.start
-                        break
-
-                # Find last non-None hyp_slice up to current position
-                hyp_stop = len(hyp)
-                for i in range(k, idx - 1, -1):
-                    if i < len(alignments) and alignments[i].hyp_slice is not None:
-                        hyp_stop = alignments[i].hyp_slice.stop
-                        break
-
-                safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
-                safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
-                idx = k + 1
-
-                # last part - handle remaining alignments after last JOIN_TOKEN
-                if idx < len(alignments):
-                    # Find first non-None ref_slice starting from idx
-                    ref_start = 0
-                    for i in range(idx, len(alignments)):
-                        if alignments[i].ref_slice is not None:
-                            ref_start = alignments[i].ref_slice.start
-                            break
-
-                    # Find last non-None ref_slice from end
-                    ref_stop = len(ref)
-                    for i in range(len(alignments) - 1, idx - 1, -1):
-                        if alignments[i].ref_slice is not None:
-                            ref_stop = alignments[i].ref_slice.stop
-                            break
-
-                    # Find first non-None hyp_slice starting from idx
-                    hyp_start = 0
-                    for i in range(idx, len(alignments)):
-                        if alignments[i].hyp_slice is not None:
-                            hyp_start = alignments[i].hyp_slice.start
-                            break
-
-                    # Find last non-None hyp_slice from end
-                    hyp_stop = len(hyp)
-                    for i in range(len(alignments) - 1, idx - 1, -1):
-                        if alignments[i].hyp_slice is not None:
-                            hyp_stop = alignments[i].hyp_slice.stop
-                            break
-
-                    safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
-                    safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")
-
-                raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
+                matches = align_supervisions_and_transcription(
+                    caption, max_duration=media_audio.duration, verbose=True
+                )
+
+                skipalign = False
+                matches = sorted(matches, key=lambda x: x[2].WER.WER)  # sort by WER
+                segments = [(m[3].start[1], m[3].end[1], m, skipalign) for m in matches]
+                for segment in segments:
+                    # transcription segments -> sentence splitting
+                    segment[2][1] = self.aligner.tokenizer.split_sentences(segment[2][1])
             else:
                 if caption.transcription:
+                    if "gemini" in self.transcriber.name.lower():
+                        raise ValueError(
+                            f"Transcription-based alignment is not supported for {self.transcriber.name} "
+                            "(Gemini's timestamp is not reliable)."
+                        )
                     if not caption.supervisions:  # youtube + transcription case
                         segments = [(sup.start, sup.end, [sup], not sup.text) for sup in caption.transcription]
                     else:
@@ -266,7 +203,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             sr = media_audio.sampling_rate
             supervisions, alignments = [], []
             for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
-
+                safe_print(
                     colorful.green(
                         f" ⏩ aligning segment {i:04d}/{len(segments):04d}: {start:8.2f}s - {end:8.2f}s"
                     )
@@ -286,8 +223,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                     media_audio,
                     _supervisions,
                     split_sentence=split_sentence or self.caption_config.split_sentence,
-                    return_details=
-                    or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
+                    return_details=True,
                     emission=emission,
                     offset=offset,
                     verbose=False,
@@ -295,14 +231,16 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

                 supervisions.extend(_supervisions)
                 alignments.extend(_alignments)
+
+            # sort by start
+            alignments = sorted(alignments, key=lambda x: x.start)
         else:
             # Step 2-4: Standard single-pass alignment
             supervisions, alignments = self.aligner.alignment(
                 media_audio,
                 caption.supervisions,
                 split_sentence=split_sentence or self.caption_config.split_sentence,
-                return_details=
-                or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
+                return_details=True,
             )

             # Update caption with aligned results
@@ -316,13 +254,15 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             if self.config.profile:
                 self.aligner.profile()

-        except (CaptionProcessingError, LatticeEncodingError
+        except (CaptionProcessingError, LatticeEncodingError) as e:
             # Re-raise our specific errors as-is
-            raise
+            raise e
+        except LatticeDecodingError as e:
+            raise e
         except Exception as e:
             # Catch any unexpected errors and wrap them
             raise AlignmentError(
-                "Unexpected error during alignment process",
+                message="Unexpected error during alignment process",
                 media_path=str(input_media),
                 caption_path=str(input_caption),
                 context={"original_error": str(e), "error_type": e.__class__.__name__},
@@ -337,6 +277,13 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 output_caption_path=output_caption_path,
             )

+        # Step 6: Event detection
+        if self.event_config.enabled and self.event_detector:
+            safe_print(colorful.cyan("🔊 Performing audio event detection..."))
+            caption = self.event_detector.detect_and_update_caption(caption, media_audio)
+            if output_caption_path:
+                self._write_caption(caption, output_caption_path)
+
         return caption

     def speaker_diarization(
@@ -367,12 +314,12 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
         if diarization_file.exists():
             safe_print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
-            caption.
+            caption.read_diarization(diarization_file)

         diarization, alignments = self.diarizer.diarize_with_alignments(
             input_media,
             caption.alignments,
-            diarization=caption.
+            diarization=caption.diarization,
             alignment_fn=self.aligner.alignment,
             transcribe_fn=self.transcriber.transcribe_numpy if self.transcriber else None,
             separate_fn=self.aligner.separate if self.aligner.worker.separator_ort else None,
@@ -380,7 +327,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             output_path=output_caption_path,
         )
         caption.alignments = alignments
-        caption.
+        caption.diarization = diarization

         # Write output if requested
         if output_caption_path:
@@ -400,6 +347,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         use_transcription: bool = False,
         channel_selector: Optional[str | int] = "average",
         streaming_chunk_secs: Optional[float] = None,
+        audio_track_id: Optional[str] = "original",
+        quality: str = "best",
     ) -> Caption:
         # Prepare output directory and media format
         output_dir = self._prepare_youtube_output_dir(output_dir)
@@ -408,7 +357,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         safe_print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))

         # Step 1: Download media
-        media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
+        media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite, audio_track_id, quality)

         media_audio = self.audio_loader(
             media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
lattifai/config/__init__.py
CHANGED
@@ -1,16 +1,44 @@
 """Configuration system for LattifAI using nemo_run."""

+# Re-export caption config classes from lattifai-captions package
+from lattifai.caption.config import (
+    ALL_CAPTION_FORMATS,
+    CAPTION_FORMATS,
+    INPUT_CAPTION_FORMATS,
+    OUTPUT_CAPTION_FORMATS,
+    CaptionFonts,
+    CaptionStyle,
+    InputCaptionFormat,
+    KaraokeConfig,
+    OutputCaptionFormat,
+    StandardizationConfig,
+)
+
 from .alignment import AlignmentConfig
+
+# CaptionConfig is defined in lattifai-python (workflow config)
 from .caption import CaptionConfig
 from .client import ClientConfig
 from .diarization import DiarizationConfig
+from .event import EventConfig
 from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
 from .transcription import TranscriptionConfig

 __all__ = [
+    "EventConfig",
     "ClientConfig",
     "AlignmentConfig",
     "CaptionConfig",
+    "CaptionFonts",
+    "CaptionStyle",
+    "KaraokeConfig",
+    "StandardizationConfig",
+    "InputCaptionFormat",
+    "OutputCaptionFormat",
+    "INPUT_CAPTION_FORMATS",
+    "OUTPUT_CAPTION_FORMATS",
+    "ALL_CAPTION_FORMATS",
+    "CAPTION_FORMATS",
     "TranscriptionConfig",
     "DiarizationConfig",
     "MediaConfig",
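Because `lattifai.config` now re-exports the caption config classes from the lattifai-captions package, both import paths resolve to the same objects:

from lattifai.caption.config import KaraokeConfig as UpstreamKaraokeConfig
from lattifai.config import KaraokeConfig

assert KaraokeConfig is UpstreamKaraokeConfig  # same class, just re-exported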
lattifai/config/alignment.py
CHANGED
@@ -93,6 +93,20 @@ class AlignmentConfig:
     Default: 0.20. Typical range: 0.0-0.5.
     """

+    boost: float = 5.0
+    """Boost for preferring supervisions over transcription in diff alignment decoding graph.
+    A positive value encourages the decoder to prefer supervision text over ASR transcription.
+    Only effective when strategy='transcription'. Has no effect with 'entire' or 'caption' strategies.
+    Default: 5.0. Typical range: 0.0-10.0.
+    """
+
+    transition_penalty: float = 0.0
+    """Penalty for token transitions in the decoding graph to discourage duration=1 tokens.
+    A negative value penalizes transitions (moving to next token), making the model prefer
+    self-loops (staying on current token longer). This helps prevent spurious short-duration alignments.
+    Default: 0.0 (no penalty). Typical range: -1.0 to 0.0 (e.g., -0.5).
+    """
+
     client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
     """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

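A sketch of tuning the two new fields, assuming `AlignmentConfig` also exposes the `strategy` field referenced in the docstrings (defaults and ranges as documented above):

from lattifai.config import AlignmentConfig

alignment = AlignmentConfig(
    strategy="transcription",  # boost only takes effect with this strategy
    boost=5.0,                 # prefer supervision text over ASR transcription
    transition_penalty=-0.5,   # discourage spurious duration=1 token alignments
)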
lattifai/config/caption.py
CHANGED
@@ -1,28 +1,18 @@
-"""Caption I/O configuration for LattifAI."""
+"""Caption I/O configuration for LattifAI SDK."""

 from dataclasses import dataclass
 from pathlib import Path
-from typing import
+from typing import Optional

-from
-
-
-
-
-
-
-
-
-OUTPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"]
-
-# All caption formats combined (for file detection)
-ALL_CAPTION_FORMATS = list(set(CAPTION_FORMATS + ["TextGrid", "json", "gemini"]))
-
-# Type aliases for better type hints
-InputCaptionFormat = Literal["auto", "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "gemini"]
-OutputCaptionFormat = Literal[
-    "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"
-]
+from lattifai.caption.config import (
+    INPUT_CAPTION_FORMATS,
+    OUTPUT_CAPTION_FORMATS,
+    InputCaptionFormat,
+    KaraokeConfig,
+    OutputCaptionFormat,
+    StandardizationConfig,
+)
+from lattifai.caption.supervision import Pathlike


 @dataclass
@@ -34,13 +24,20 @@ class CaptionConfig:
     """

     input_format: InputCaptionFormat = "auto"
-    """Input caption format: 'auto'
+    """Input caption format. Supports: 'auto' (detect),
+    standard formats (srt, vtt, ass, ssa, sub, sbv, txt, sami, smi),
+    tabular (csv, tsv, aud, json),
+    specialized (textgrid, gemini),
+    NLE (avid_ds, fcpxml, premiere_xml, audition_csv).
+    Note: VTT format auto-detects YouTube VTT with word-level timestamps.
+    """

     input_path: Optional[str] = None
     """Path to input caption file."""

     output_format: OutputCaptionFormat = "srt"
-    """Output caption format:
+    """Output caption format. Supports: standard formats, tabular, specialized, TTML profiles (ttml, imsc1, ebu_tt_d),
+    NLE (avid_ds, fcpxml, premiere_xml, audition_csv, edimarker_csv)."""

     output_path: Optional[str] = None
     """Path to output caption file."""
@@ -57,12 +54,21 @@ class CaptionConfig:
     word_level: bool = False
     """Include word-level timestamps in alignment results (useful for karaoke, dubbing)."""

+    karaoke: Optional[KaraokeConfig] = None
+    """Karaoke configuration when word_level=True (e.g., ASS \\kf tags, enhanced LRC).
+    When None with word_level=True, outputs word-per-segment instead of karaoke styling.
+    When provided, karaoke.enabled controls whether karaoke styling is applied."""
+
     encoding: str = "utf-8"
     """Character encoding for reading/writing caption files (default: utf-8)."""

     source_lang: Optional[str] = None
     """Source language code for the caption content (e.g., 'en', 'zh', 'de')."""

+    standardization: Optional[StandardizationConfig] = None
+    """Standardization configuration for broadcast-grade captions.
+    When provided, captions will be standardized according to Netflix/BBC guidelines."""
+
     def __post_init__(self):
         """Validate configuration after initialization."""
         self._normalize_paths()
@@ -86,14 +92,17 @@ class CaptionConfig:
         return True

     def _normalize_paths(self) -> None:
-        """Normalize and expand input/output paths.
+        """Normalize and expand input/output paths.
+
+        Uses Path.resolve() to get absolute paths and prevent path traversal issues.
+        """
         # Expand and normalize input path if provided, but don't require it to exist yet
         # (it might be set later after downloading captions)
         if self.input_path is not None:
-            self.input_path = str(Path(self.input_path).expanduser())
+            self.input_path = str(Path(self.input_path).expanduser().resolve())

         if self.output_path is not None:
-            self.output_path = str(Path(self.output_path).expanduser())
+            self.output_path = str(Path(self.output_path).expanduser().resolve())
             output_dir = Path(self.output_path).parent
             output_dir.mkdir(parents=True, exist_ok=True)

@@ -154,7 +163,7 @@ class CaptionConfig:
         if not self.input_path:
             raise ValueError("input_path is required but not set in CaptionConfig")

-        input_file = Path(self.input_path).expanduser()
+        input_file = Path(self.input_path).expanduser().resolve()
         if not input_file.exists():
             raise FileNotFoundError(
                 f"Input caption file does not exist: '{input_file}'. " "Please check the path and try again."
@@ -164,15 +173,20 @@ class CaptionConfig:
                 f"Input caption path is not a file: '{input_file}'. " "Expected a valid caption file path."
             )

-    def check_sanity(self) ->
-        """Perform sanity checks on the configuration.
-
+    def check_sanity(self) -> None:
+        """Perform sanity checks on the configuration.
+
+        Raises:
+            ValueError: If input path is not provided or does not exist.
+        """
+        if not self.is_input_path_existed():
+            raise ValueError("Input caption path must be provided and exist.")

     def is_input_path_existed(self) -> bool:
         """Check if input caption path is provided and exists."""
         if self.input_path is None:
             return False

-        input_file = Path(self.input_path).expanduser()
+        input_file = Path(self.input_path).expanduser().resolve()
         self.input_path = str(input_file)
         return input_file.exists() and input_file.is_file()
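A sketch of the expanded CaptionConfig, assuming `KaraokeConfig` accepts an `enabled` flag (per its docstring above) and `StandardizationConfig` is default-constructible:

from lattifai.config import CaptionConfig, KaraokeConfig, StandardizationConfig

caption_cfg = CaptionConfig(
    input_format="auto",                      # format detected from the input file
    output_format="ass",                      # ASS can carry karaoke styling
    word_level=True,                          # word-level timestamps for karaoke
    karaoke=KaraokeConfig(enabled=True),      # apply karaoke styling (e.g., \kf tags)
    standardization=StandardizationConfig(),  # Netflix/BBC-style standardization
)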
lattifai/config/client.py
CHANGED
@@ -31,6 +31,13 @@ class ClientConfig:
     When True, prints detailed timing information for various stages of the process.
     """

+    # Client identification for usage tracking
+    client_name: Optional[str] = field(default="python-sdk")
+    """Client identifier for usage tracking (e.g., 'python-sdk', 'claude-plugin')."""
+
+    client_version: Optional[str] = field(default=None)
+    """Client version for usage tracking. If None, uses lattifai package version."""
+
     def __post_init__(self):
         """Validate and auto-populate configuration after initialization."""

@@ -44,6 +51,15 @@ class ClientConfig:
         if self.api_key is None:
             object.__setattr__(self, "api_key", os.environ.get("LATTIFAI_API_KEY"))

+        # Auto-load client version from package if not provided
+        if self.client_version is None:
+            try:
+                from importlib.metadata import version
+
+                object.__setattr__(self, "client_version", version("lattifai"))
+            except Exception:
+                object.__setattr__(self, "client_version", "unknown")
+
         # Validate API parameters
         if self.timeout <= 0:
             raise ValueError("timeout must be greater than 0")
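The new tracking fields resolve as follows; a sketch assuming only the fields shown in this diff:

from lattifai.config import ClientConfig

config = ClientConfig(api_key="...")  # api_key may also come from LATTIFAI_API_KEY
print(config.client_name)             # "python-sdk" (default)
print(config.client_version)          # installed lattifai version, else "unknown"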