lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +42 -27
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
- lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/supervision.py +1 -0
- lattifai/{io → caption}/text_parser.py +53 -10
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +455 -246
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +41 -34
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/types.py +30 -0
- lattifai/utils.py +3 -31
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/file_manager.py +81 -57
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -324
- lattifai/bin/align.py +0 -295
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -43
- lattifai/io/reader.py +0 -86
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -102
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -12
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.6.dist-info/METADATA +0 -806
- lattifai-0.4.6.dist-info/RECORD +0 -39
- lattifai-0.4.6.dist-info/entry_points.txt +0 -3
- /lattifai/{io → caption}/gemini_reader.py +0 -0
- /lattifai/{io → caption}/gemini_writer.py +0 -0
- /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
- /lattifai/{workflows → workflow}/base.py +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Transcription CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from lhotse.utils import Pathlike
|
|
7
|
+
from typing_extensions import Annotated
|
|
8
|
+
|
|
9
|
+
from lattifai.audio2 import AudioLoader, ChannelSelectorType
|
|
10
|
+
from lattifai.cli.alignment import align as alignment_align
|
|
11
|
+
from lattifai.config import (
|
|
12
|
+
AlignmentConfig,
|
|
13
|
+
CaptionConfig,
|
|
14
|
+
ClientConfig,
|
|
15
|
+
DiarizationConfig,
|
|
16
|
+
MediaConfig,
|
|
17
|
+
TranscriptionConfig,
|
|
18
|
+
)
|
|
19
|
+
from lattifai.utils import _resolve_model_path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@run.cli.entrypoint(name="run", namespace="transcribe")
def transcribe(
    input: Optional[str] = None,
    output_caption: Optional[str] = None,
    output_dir: Optional[Pathlike] = None,
    media_format: str = "mp3",
    channel_selector: Optional[ChannelSelectorType] = "average",
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
):
    """
    Transcribe audio/video file or YouTube URL to caption.

    This command performs automatic speech recognition (ASR) on audio/video files
    or YouTube videos, generating timestamped transcriptions in various caption formats.

    Shortcut: invoking ``lai-transcribe`` is equivalent to running ``lai transcribe run``.

    Args:
        input: Path to input audio/video file or YouTube URL (can be provided as positional argument)
        output_caption: Path for output caption file (can be provided as positional argument)
        output_dir: Directory for output files when using YouTube URL
        media_format: Media format for YouTube downloads (default: mp3)
        channel_selector: Audio channel selection strategy (default: average)
            Options: average, left, right, or an integer channel index.
            Note: Ignored when input is a URL and the transcriber supports URLs directly.
        transcription: Transcription service configuration.
            Fields: model_name, device, language, gemini_api_key

    Returns:
        The transcript object produced by the selected transcriber.

    Raises:
        ValueError: If ``input`` is missing.

    Examples:
        # Transcribe local file with positional arguments
        lai transcribe run audio.wav output.srt

        # Transcribe YouTube video
        lai transcribe run "https://www.youtube.com/watch?v=VIDEO_ID" ./output

        # Using specific transcription model
        lai transcribe run audio.mp4 output.ass \\
            transcription.model_name=nvidia/parakeet-tdt-0.6b-v3

        # Using Gemini transcription (requires API key)
        lai transcribe run audio.wav output.srt \\
            transcription.model_name=gemini-2.5-pro \\
            transcription.gemini_api_key=YOUR_KEY

        # Specify language for transcription
        lai transcribe run audio.wav output.srt \\
            transcription.language=zh

        # Full configuration with keyword arguments
        lai transcribe run \\
            input=audio.wav \\
            output_caption=output.srt \\
            transcription.device=cuda \\
            transcription.model_name=iic/SenseVoiceSmall
    """
    # Deferred imports keep CLI startup fast when this entrypoint is not used.
    import asyncio
    from pathlib import Path

    import colorful

    from lattifai.transcription import create_transcriber

    # Initialize transcription config with defaults
    transcription_config = transcription or TranscriptionConfig()

    # Input is mandatory; nemo_run allows it to arrive as None.
    if not input:
        raise ValueError("Input is required. Provide input as positional argument (file path or URL).")

    # Detect if input is a URL
    is_url = input.startswith(("http://", "https://"))

    # Prepare output paths: URLs write into output_dir (or CWD), local files
    # write next to the input file.
    if is_url:
        if output_dir:
            output_path = Path(str(output_dir)).expanduser()
            output_path.mkdir(parents=True, exist_ok=True)
        else:
            output_path = Path.cwd()
    else:
        input_path = Path(str(input))
        output_path = input_path.parent

    # Create transcriber, resolving the default lattice model if none is set.
    if not transcription_config.lattice_model_path:
        transcription_config.lattice_model_path = _resolve_model_path("Lattifai/Lattice-1")
    transcriber = create_transcriber(transcription_config=transcription_config)

    print(colorful.cyan(f"🎤 Starting transcription with {transcriber.name}..."))
    print(colorful.cyan(f"   Input: {input}"))

    # Perform transcription
    if is_url and transcriber.supports_url:
        # Transcriber can consume the URL directly (no local download needed).
        print(colorful.cyan("   Transcribing from URL directly..."))
        transcript = asyncio.run(transcriber.transcribe(input))
    else:
        if is_url:
            # Download media first, then transcribe from the local file.
            print(colorful.cyan("   Downloading media from URL..."))
            from lattifai.workflow.youtube import YouTubeDownloader

            downloader = YouTubeDownloader()
            input_path = asyncio.run(
                downloader.download_media(
                    url=input,
                    output_dir=str(output_path),
                    media_format=media_format,
                    force_overwrite=False,
                )
            )
            print(colorful.cyan(f"   Media downloaded to: {input_path}"))
        else:
            input_path = Path(str(input))

        print(colorful.cyan("   Loading audio..."))
        # For files, load audio first
        audio_loader = AudioLoader(device=transcription_config.device)
        media_audio = audio_loader(input_path, channel_selector=channel_selector)
        transcript = asyncio.run(transcriber.transcribe(media_audio))

    # Determine output caption path
    if output_caption:
        final_output = Path(str(output_caption))
        final_output.parent.mkdir(parents=True, exist_ok=True)
    else:
        # Derive the extension from the transcriber so the filename matches the
        # format its writer actually emits.
        output_format = transcriber.file_suffix.lstrip(".")
        if is_url:
            final_output = output_path / f"youtube_LattifAI_{transcriber.name}.{output_format}"
        else:
            # FIX: previously hard-coded ".LattifAI.srt" here, which mislabeled
            # output for transcribers whose file_suffix is not ".srt"; now uses
            # the same transcriber-derived suffix as the URL branch.
            final_output = Path(str(input)).with_suffix(f".LattifAI.{output_format}")

    print(colorful.cyan(f"   Output: {final_output}"))

    # Write output
    transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)

    print(colorful.green(f"🎉 Transcription completed: {final_output}"))

    return transcript
|
|
167
|
+
|
|
168
|
+
@run.cli.entrypoint(name="align", namespace="transcribe")
def transcribe_align(
    input_media: Optional[str] = None,
    output_caption: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """Alias for ``lai alignment align`` exposed under the transcribe namespace.

    Every argument is forwarded unchanged to
    :func:`lattifai.cli.alignment.align`, and its return value is passed back
    to the caller.
    """
    # Collect the forwarded arguments once so the delegation is explicit.
    forwarded = dict(
        input_media=input_media,
        output_caption=output_caption,
        media=media,
        caption=caption,
        client=client,
        alignment=alignment,
        transcription=transcription,
        diarization=diarization,
    )
    return alignment_align(**forwarded)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def main():
    """Entry point for lai-transcribe command."""
    # Delegates argv parsing (positional args plus dotted config overrides)
    # to nemo_run's CLI driver, which then invokes `transcribe`.
    run.cli.main(transcribe)


if __name__ == "__main__":
    main()
|
lattifai/cli/youtube.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""YouTube workflow CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from typing_extensions import Annotated
|
|
7
|
+
|
|
8
|
+
from lattifai.client import LattifAI
|
|
9
|
+
from lattifai.config import (
|
|
10
|
+
AlignmentConfig,
|
|
11
|
+
CaptionConfig,
|
|
12
|
+
ClientConfig,
|
|
13
|
+
DiarizationConfig,
|
|
14
|
+
MediaConfig,
|
|
15
|
+
TranscriptionConfig,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@run.cli.entrypoint(name="youtube", namespace="alignment")
def youtube(
    yt_url: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Download media from YouTube (when needed) and align captions.

    Workflow for a YouTube URL:
    1. Downloads media in the configured format (audio or video)
    2. Optionally transcribes audio with Gemini OR downloads YouTube captions
    3. Runs forced alignment against the provided or generated captions

    Shortcut: invoking ``lai-youtube`` is equivalent to running ``lai alignment youtube``.

    Args:
        yt_url: YouTube video URL (can be provided as positional argument).
            Mutually exclusive with ``media.input_path``; exactly one is required.
        media: Media configuration for controlling formats and output directories.
            Fields: input_path (YouTube URL), output_dir, output_format, force_overwrite
        client: API client configuration.
            Fields: api_key, timeout, max_retries
        alignment: Alignment configuration (model selection and inference settings).
            Fields: model_name, device, batch_size
        caption: Caption configuration for reading/writing caption files.
            Fields: output_format, output_path, normalize_text,
                    split_sentence, word_level, encoding
        transcription: Transcription service configuration (enables Gemini transcription).
            Fields: gemini_api_key, model_name, language, device
        diarization: Speaker diarization configuration.
            Fields: enabled, num_speakers, min_speakers, max_speakers, device

    Raises:
        ValueError: If both or neither of ``yt_url`` and ``media.input_path``
            are supplied.

    Examples:
        # Download from YouTube and align (positional argument)
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID"

        # With custom output directory and format
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            media.output_dir=/tmp/youtube \\
            media.output_format=mp3

        # Full configuration with smart splitting and word-level alignment
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            caption.output_path=aligned.srt \\
            caption.split_sentence=true \\
            caption.word_level=true \\
            alignment.device=cuda

        # Use Gemini transcription (requires API key)
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            transcription.gemini_api_key=YOUR_KEY \\
            transcription.model_name=gemini-2.0-flash

        # Using keyword argument (traditional syntax)
        lai alignment youtube \\
            yt_url="https://www.youtube.com/watch?v=VIDEO_ID" \\
            alignment.device=mps
    """
    # Fall back to default configs when none were supplied on the CLI.
    media_cfg = media or MediaConfig()
    caption_cfg = caption or CaptionConfig()

    # Exactly one URL source is accepted: the positional argument or the config.
    if yt_url and media_cfg.input_path:
        raise ValueError(
            "Cannot specify both positional yt_url and media.input_path. "
            "Use either positional argument or config, not both."
        )
    if not yt_url and not media_cfg.input_path:
        raise ValueError("YouTube URL is required. Provide either positional yt_url or media.input_path parameter.")
    if yt_url:
        # Route the positional URL through the config so downstream code has a
        # single source of truth.
        media_cfg.set_input_path(yt_url)

    # Build the client with the full set of (possibly None) configurations.
    lattifai_client = LattifAI(
        client_config=client,
        alignment_config=alignment,
        caption_config=caption_cfg,
        transcription_config=transcription,
        diarization_config=diarization,
    )

    # Only normalize the media format when one was explicitly configured.
    resolved_format = media_cfg.normalize_format() if media_cfg.output_format else None

    # Delegate the actual download/transcribe/align workflow to the client.
    return lattifai_client.youtube(
        url=media_cfg.input_path,
        output_dir=media_cfg.output_dir,
        output_caption_path=caption_cfg.output_path,
        media_format=resolved_format,
        force_overwrite=media_cfg.force_overwrite,
        split_sentence=caption_cfg.split_sentence,
        channel_selector=media_cfg.channel_selector,
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main():
    """Entry point for the lai-youtube command."""
    # Delegates argv parsing (positional args plus dotted config overrides)
    # to nemo_run's CLI driver, which then invokes `youtube`.
    run.cli.main(youtube)


if __name__ == "__main__":
    main()
|