lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/cli/caption.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Caption CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from lhotse.utils import Pathlike
|
|
7
|
+
from typing_extensions import Annotated
|
|
8
|
+
|
|
9
|
+
from lattifai.config import CaptionConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@run.cli.entrypoint(name="convert", namespace="caption")
def convert(
    input_path: Pathlike,
    output_path: Pathlike,
    include_speaker_in_text: bool = True,
    normalize_text: bool = False,
):
    """
    Convert a caption file from one format to another.

    Reads the input caption file and rewrites it in the format implied by the
    output file's extension, preserving timing information, text content, and
    speaker labels (if present). Common caption formats such as SRT, VTT, JSON,
    and Praat TextGrid are supported.

    Shortcut: invoking ``laisub-convert`` is equivalent to running ``lai caption convert``.

    Args:
        input_path: Input caption file (SRT, VTT, JSON, TextGrid, ...).
        output_path: Output caption file; the extension selects the format.
        include_speaker_in_text: Keep speaker labels inside the caption text.
        normalize_text: Clean the text while reading (e.g. strip HTML tags,
            decode entities, collapse whitespace, standardize punctuation).

    Examples:
        # Basic format conversion (positional arguments)
        lai caption convert input.srt output.vtt

        # Convert with text normalization
        lai caption convert input.srt output.json normalize_text=true

        # Mixing positional and keyword arguments
        lai caption convert input.srt output.vtt \\
            include_speaker_in_text=false \\
            normalize_text=true

        # Using keyword arguments (traditional syntax)
        lai caption convert \\
            input_path=input.srt \\
            output_path=output.TextGrid
    """
    # Imported lazily so the CLI module stays cheap to import.
    from lattifai.caption import Caption

    parsed = Caption.read(input_path, normalize_text=normalize_text)
    parsed.write(output_path, include_speaker_in_text=include_speaker_in_text)

    print(f"✅ Converted {input_path} -> {output_path}")
    return output_path
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@run.cli.entrypoint(name="normalize", namespace="caption")
def normalize(
    input_path: Pathlike,
    output_path: Pathlike,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
):
    """
    Normalize caption text by cleaning HTML entities and whitespace.

    Reads the input caption file with text normalization forced on, then writes
    the cleaned result. Normalization applies:
    - Decoding of common HTML entities
    - Removal of HTML tags (e.g., <i>, <font>, <b>, <br>)
    - Collapsing of runs of whitespace into single spaces
    - Straightening of curly apostrophes in contractions
    - Stripping of leading/trailing whitespace per segment

    Shortcut: invoking ``laisub-normalize`` is equivalent to running ``lai caption normalize``.

    Args:
        input_path: Caption file to normalize.
        output_path: Destination file (pass the input path to overwrite in place).
        caption: Caption configuration.
            NOTE(review): this parameter is accepted for CLI symmetry but is not
            currently read by this command — normalization is always enabled.

    Examples:
        # Normalize and save to new file (positional arguments)
        lai caption normalize input.srt output.srt

        # Normalize with format conversion
        lai caption normalize input.vtt output.srt

        # Normalize with custom caption config
        lai caption normalize input.srt output.srt \\
            caption.encoding=utf-8

        # Using keyword arguments (traditional syntax)
        lai caption normalize \\
            input_path=input.srt \\
            output_path=output.srt
    """
    from pathlib import Path

    from lattifai.caption import Caption

    src = Path(input_path).expanduser()
    dst = Path(output_path).expanduser()

    normalized = Caption.read(src, normalize_text=True)
    normalized.write(dst, include_speaker_in_text=True)

    # Report whether the file was rewritten in place or copied elsewhere.
    if dst == src:
        print(f"✅ Normalized {src} (in-place)")
    else:
        print(f"✅ Normalized {src} -> {dst}")

    return dst
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@run.cli.entrypoint(name="shift", namespace="caption")
def shift(
    input_path: Pathlike,
    output_path: Pathlike,
    seconds: float,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
):
    """
    Shift caption timestamps by a specified number of seconds.

    Reads a caption file and offsets every timestamp by ``seconds``. Positive
    values delay the captions; negative values make them appear earlier.

    Shortcut: invoking ``laisub-shift`` is equivalent to running ``lai caption shift``.

    Args:
        input_path: Caption file whose timestamps should be shifted.
        output_path: Destination file (pass the input path to modify in place).
        seconds: Offset in seconds. Positive delays captions, negative advances
            them.
        caption: Caption configuration.
            NOTE(review): accepted for CLI symmetry but not currently read by
            this command.

    Examples:
        # Delay captions by 2 seconds (positional arguments)
        lai caption shift input.srt output.srt 2.0

        # Make captions appear 1.5 seconds earlier
        lai caption shift input.srt output.srt -1.5

        # Shift and convert format
        lai caption shift input.vtt output.srt seconds=0.5

        # Using keyword arguments (traditional syntax)
        lai caption shift \\
            input_path=input.srt \\
            output_path=output.srt \\
            seconds=3.0
    """
    from pathlib import Path

    from lattifai.caption import Caption

    src = Path(input_path).expanduser()
    dst = Path(output_path).expanduser()

    # Read, offset every timestamp, and write the shifted captions.
    shifted = Caption.read(src).shift_time(seconds)
    shifted.write(dst, include_speaker_in_text=True)

    direction = f"delayed by {seconds}s" if seconds >= 0 else f"advanced by {abs(seconds)}s"

    if dst == src:
        print(f"✅ Shifted timestamps {direction} in {src} (in-place)")
    else:
        print(f"✅ Shifted timestamps {direction}: {src} -> {dst}")

    return dst
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def main_convert():
    """Console-script entry point for ``laisub-convert`` (same as ``lai caption convert``)."""
    run.cli.main(convert)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def main_normalize():
    """Console-script entry point for ``laisub-normalize`` (same as ``lai caption normalize``)."""
    run.cli.main(normalize)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def main_shift():
    """Console-script entry point for ``laisub-shift`` (same as ``lai caption shift``)."""
    run.cli.main(shift)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# Executing this module directly defaults to the convert command.
if __name__ == "__main__":
    main_convert()
|
lattifai/cli/server.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import colorful
|
|
4
|
+
import uvicorn
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main():
    """Launch the LattifAI Web Interface (uvicorn with auto-reload on port 8001)."""
    banner = colorful.bold_green("🚀 Launching LattifAI Web Interface...")
    hint = colorful.cyan("See http://localhost:8001")
    print(banner)
    print(hint)

    # The app is passed as an import string so uvicorn's reloader can
    # re-import it; this relies on the installed lattifai package being
    # importable as ``lattifai.server.app``.
    uvicorn.run(
        "lattifai.server.app:app",
        host="0.0.0.0",
        port=8001,
        reload=True,
        log_level="info",
    )
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Allow running the server module directly, e.g. ``python -m lattifai.cli.server``.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Transcription CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from lhotse.utils import Pathlike
|
|
7
|
+
from typing_extensions import Annotated
|
|
8
|
+
|
|
9
|
+
from lattifai.audio2 import AudioLoader, ChannelSelectorType
|
|
10
|
+
from lattifai.cli.alignment import align as alignment_align
|
|
11
|
+
from lattifai.config import (
|
|
12
|
+
AlignmentConfig,
|
|
13
|
+
CaptionConfig,
|
|
14
|
+
ClientConfig,
|
|
15
|
+
DiarizationConfig,
|
|
16
|
+
MediaConfig,
|
|
17
|
+
TranscriptionConfig,
|
|
18
|
+
)
|
|
19
|
+
from lattifai.utils import _resolve_model_path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@run.cli.entrypoint(name="run", namespace="transcribe")
def transcribe(
    input: Optional[str] = None,
    output_caption: Optional[str] = None,
    output_dir: Optional[Pathlike] = None,
    media_format: str = "mp3",
    channel_selector: Optional[ChannelSelectorType] = "average",
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
):
    """
    Transcribe audio/video file or YouTube URL to caption.

    This command performs automatic speech recognition (ASR) on audio/video files
    or YouTube videos, generating timestamped transcriptions in various caption formats.

    Shortcut: invoking ``lai-transcribe`` is equivalent to running ``lai transcribe run``.

    Args:
        input: Path to input audio/video file or YouTube URL (can be provided as positional argument)
        output_caption: Path for output caption file (can be provided as positional argument)
        output_dir: Directory for output files when using YouTube URL
        media_format: Media format for YouTube downloads (default: mp3)
        channel_selector: Audio channel selection strategy (default: average)
            Options: average, left, right, or an integer channel index.
            Note: Ignored when input is a URL and Gemini transcriber is used.
        transcription: Transcription service configuration.
            Fields: model_name, device, language, gemini_api_key

    Examples:
        # Transcribe local file with positional arguments
        lai transcribe run audio.wav output.srt

        # Transcribe YouTube video
        lai transcribe run "https://www.youtube.com/watch?v=VIDEO_ID" ./output

        # Using specific transcription model
        lai transcribe run audio.mp4 output.ass \\
            transcription.model_name=nvidia/parakeet-tdt-0.6b-v3

        # Using Gemini transcription (requires API key)
        lai transcribe run audio.wav output.srt \\
            transcription.model_name=gemini-2.5-pro \\
            transcription.gemini_api_key=YOUR_KEY

        # Specify language for transcription
        lai transcribe run audio.wav output.srt \\
            transcription.language=zh

        # Full configuration with keyword arguments
        lai transcribe run \\
            input=audio.wav \\
            output_caption=output.srt \\
            transcription.device=cuda \\
            transcription.model_name=iic/SenseVoiceSmall
    """
    # Heavy dependencies are imported lazily so the CLI loads fast.
    import asyncio
    from pathlib import Path

    import colorful

    from lattifai.transcription import create_transcriber

    # Initialize transcription config with defaults
    transcription_config = transcription or TranscriptionConfig()

    # Validate input is required
    if not input:
        raise ValueError("Input is required. Provide input as positional argument (file path or URL).")

    # Detect if input is a URL (only http/https schemes are treated as remote)
    is_url = input.startswith(("http://", "https://"))

    # Prepare output paths
    if is_url:
        # For URLs, use output_dir (created if missing); fall back to the CWD
        if output_dir:
            output_path = Path(str(output_dir)).expanduser()
            output_path.mkdir(parents=True, exist_ok=True)
        else:
            output_path = Path.cwd()
    else:
        # For files, use input path directory
        input_path = Path(str(input))
        output_path = input_path.parent

    # Create transcriber; resolve the default lattice model path only when the
    # config did not already provide one.
    if not transcription_config.lattice_model_path:
        transcription_config.lattice_model_path = _resolve_model_path("Lattifai/Lattice-1")
    transcriber = create_transcriber(transcription_config=transcription_config)

    print(colorful.cyan(f"🎤 Starting transcription with {transcriber.name}..."))
    print(colorful.cyan(f" Input: {input}"))

    # Perform transcription
    if is_url and transcriber.supports_url:
        # The transcriber can consume the URL directly — no local download.
        # NOTE: channel_selector is not applied on this path.
        print(colorful.cyan(" Transcribing from URL directly..."))
        transcript = asyncio.run(transcriber.transcribe(input))
    else:
        if is_url:
            # Download media first, then transcribe the local file
            print(colorful.cyan(" Downloading media from URL..."))
            from lattifai.workflow.youtube import YouTubeDownloader

            downloader = YouTubeDownloader()
            input_path = asyncio.run(
                downloader.download_media(
                    url=input,
                    output_dir=str(output_path),
                    media_format=media_format,
                    force_overwrite=False,
                )
            )
            print(colorful.cyan(f" Media downloaded to: {input_path}"))
        else:
            input_path = Path(str(input))

        print(colorful.cyan(" Loading audio..."))
        # For files, load audio first (channel_selector applies here)
        audio_loader = AudioLoader(device=transcription_config.device)
        media_audio = audio_loader(input_path, channel_selector=channel_selector)
        transcript = asyncio.run(transcriber.transcribe(media_audio))

    # Determine output caption path
    if output_caption:
        final_output = Path(str(output_caption))
        final_output.parent.mkdir(parents=True, exist_ok=True)
    else:
        if is_url:
            # For URLs, generate output filename based on transcriber
            output_format = transcriber.file_suffix.lstrip(".")
            final_output = output_path / f"youtube_LattifAI_{transcriber.name}.{output_format}"
        else:
            # For files, replace the input's extension with .LattifAI.srt
            final_output = Path(str(input)).with_suffix(".LattifAI.srt")

    print(colorful.cyan(f" Output: {final_output}"))

    # Write output (UTF-8; audio-event caching disabled)
    transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)

    print(colorful.green(f"🎉 Transcription completed: {final_output}"))

    return transcript
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@run.cli.entrypoint(name="align", namespace="transcribe")
def transcribe_align(
    input_media: Optional[str] = None,
    output_caption: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Alias for the alignment command, exposed under the ``transcribe`` namespace.

    Forwards every argument unchanged to :func:`lattifai.cli.alignment.align`,
    so ``lai transcribe align`` behaves identically to ``lai alignment align``.
    """
    return alignment_align(
        input_media=input_media,
        output_caption=output_caption,
        media=media,
        caption=caption,
        client=client,
        alignment=alignment,
        transcription=transcription,
        diarization=diarization,
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def main():
    """Entry point for the ``lai-transcribe`` command (same as ``lai transcribe run``)."""
    run.cli.main(transcribe)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# Allow running this module directly, e.g. ``python -m lattifai.cli.transcribe``.
if __name__ == "__main__":
    main()
|
lattifai/cli/youtube.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""YouTube workflow CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from typing_extensions import Annotated
|
|
7
|
+
|
|
8
|
+
from lattifai.client import LattifAI
|
|
9
|
+
from lattifai.config import (
|
|
10
|
+
AlignmentConfig,
|
|
11
|
+
CaptionConfig,
|
|
12
|
+
ClientConfig,
|
|
13
|
+
DiarizationConfig,
|
|
14
|
+
MediaConfig,
|
|
15
|
+
TranscriptionConfig,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@run.cli.entrypoint(name="youtube", namespace="alignment")
def youtube(
    yt_url: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Download media from YouTube (when needed) and align captions.

    Convenience workflow for aligning captions with YouTube videos. Given a
    YouTube URL it will:
    1. Download media in the requested format (audio or video)
    2. Optionally transcribe audio with Gemini OR download YouTube captions
    3. Run forced alignment against the provided or generated captions

    Shortcut: invoking ``lai-youtube`` is equivalent to running ``lai alignment youtube``.

    Args:
        yt_url: YouTube video URL (can be provided as positional argument)
        media: Media configuration (formats and output directories).
            Fields: input_path (YouTube URL), output_dir, output_format, force_overwrite
        client: API client configuration.
            Fields: api_key, timeout, max_retries
        alignment: Alignment configuration (model selection and inference settings).
            Fields: model_name, device, batch_size
        caption: Caption configuration for reading/writing caption files.
            Fields: output_format, output_path, normalize_text,
            split_sentence, word_level, encoding
        transcription: Transcription service configuration (enables Gemini transcription).
            Fields: gemini_api_key, model_name, language, device
        diarization: Speaker diarization configuration.
            Fields: enabled, num_speakers, min_speakers, max_speakers, device

    Examples:
        # Download from YouTube and align (positional argument)
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID"

        # With custom output directory and format
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            media.output_dir=/tmp/youtube \\
            media.output_format=mp3

        # Full configuration with smart splitting and word-level alignment
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            caption.output_path=aligned.srt \\
            caption.split_sentence=true \\
            caption.word_level=true \\
            alignment.device=cuda

        # Use Gemini transcription (requires API key)
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            transcription.gemini_api_key=YOUR_KEY \\
            transcription.model_name=gemini-2.0-flash

        # Using keyword argument (traditional syntax)
        lai alignment youtube \\
            yt_url="https://www.youtube.com/watch?v=VIDEO_ID" \\
            alignment.device=mps
    """
    # Fall back to default configs when none were supplied on the CLI.
    media_cfg = MediaConfig() if media is None else media
    caption_cfg = CaptionConfig() if caption is None else caption

    # Exactly one URL source is allowed: positional yt_url or media.input_path.
    has_positional_url = bool(yt_url)
    has_config_url = bool(media_cfg.input_path)
    if has_positional_url and has_config_url:
        raise ValueError(
            "Cannot specify both positional yt_url and media.input_path. "
            "Use either positional argument or config, not both."
        )
    if not (has_positional_url or has_config_url):
        raise ValueError("YouTube URL is required. Provide either positional yt_url or media.input_path parameter.")

    # Normalize: the media config is the single source of truth for the URL.
    if has_positional_url:
        media_cfg.set_input_path(yt_url)

    # Build the client from all provided configurations.
    api = LattifAI(
        client_config=client,
        alignment_config=alignment,
        caption_config=caption_cfg,
        transcription_config=transcription,
        diarization_config=diarization,
    )

    # Only normalize the media format when an output format was requested.
    fmt = media_cfg.normalize_format() if media_cfg.output_format else None

    # Delegate the actual download/transcribe/align workflow to the client.
    return api.youtube(
        url=media_cfg.input_path,
        output_dir=media_cfg.output_dir,
        output_caption_path=caption_cfg.output_path,
        media_format=fmt,
        force_overwrite=media_cfg.force_overwrite,
        split_sentence=caption_cfg.split_sentence,
        channel_selector=media_cfg.channel_selector,
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main():
    """Console-script entry point for ``lai-youtube`` (same as ``lai alignment youtube``)."""
    run.cli.main(youtube)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Allow running this module directly, e.g. ``python -m lattifai.cli.youtube``.
if __name__ == "__main__":
    main()
|