lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,736 @@
Metadata-Version: 2.4
Name: lattifai
Version: 1.0.0
Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
Author-email: Lattifai Technologies <tech@lattifai.com>
Maintainer-email: Lattice <tech@lattifai.com>
License: MIT License

Copyright (c) 2025 Lattifai.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Project-URL: Homepage, https://github.com/lattifai/lattifai-python
Project-URL: Documentation, https://github.com/lattifai/lattifai-python/README.md
Project-URL: Bug Tracker, https://github.com/lattifai/lattifai-python/issues
Project-URL: Discussions, https://github.com/lattifai/lattifai-python/discussions
Project-URL: Changelog, https://github.com/lattifai/lattifai-python/CHANGELOG.md
Keywords: lattifai,speech recognition,video analysis,ai,sdk,api client
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Operating System :: POSIX :: Linux
Classifier: Operating System :: Microsoft :: Windows
Classifier: Topic :: Multimedia :: Sound/Audio
Classifier: Topic :: Multimedia :: Video
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: <3.15,>=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: lattifai-core>=0.4.5
Requires-Dist: lattifai-run>=1.0.1
Requires-Dist: python-dotenv
Requires-Dist: lhotse>=1.26.0
Requires-Dist: colorful>=0.5.6
Requires-Dist: pysubs2
Requires-Dist: praatio
Requires-Dist: tgt
Requires-Dist: onnx>=1.16.0
Requires-Dist: onnxruntime
Requires-Dist: msgpack
Requires-Dist: g2p-phonemizer>=0.4.0
Requires-Dist: av
Requires-Dist: wtpsplit>=2.1.6
Requires-Dist: OmniSenseVoice>=0.4.0
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc1
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2
Requires-Dist: questionary>=2.0
Requires-Dist: yt-dlp
Requires-Dist: pycryptodome
Requires-Dist: google-genai>=1.22.0
Requires-Dist: fastapi>=0.111.0
Requires-Dist: uvicorn>=0.30.0
Requires-Dist: python-multipart>=0.0.9
Requires-Dist: jinja2>=3.1.4
Provides-Extra: numpy
Requires-Dist: numpy; extra == "numpy"
Provides-Extra: diarization
Requires-Dist: torch-audiomentations==0.12.0; extra == "diarization"
Requires-Dist: pyannote.audio>=4.0.2; extra == "diarization"
Provides-Extra: transcription
Requires-Dist: OmniSenseVoice>=0.4.0; extra == "transcription"
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc1; extra == "transcription"
Provides-Extra: test
Requires-Dist: pytest; extra == "test"
Requires-Dist: pytest-cov; extra == "test"
Requires-Dist: pytest-asyncio; extra == "test"
Requires-Dist: numpy; extra == "test"
Provides-Extra: all
Requires-Dist: numpy; extra == "all"
Requires-Dist: pytest; extra == "all"
Requires-Dist: pytest-cov; extra == "all"
Requires-Dist: pytest-asyncio; extra == "all"
Requires-Dist: pyannote.audio>=4.0.2; extra == "all"
Dynamic: license-file

<div align="center">
<img src="https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/logo.png" width=256>

[](https://badge.fury.io/py/lattifai)
[](https://pypi.org/project/lattifai)
[](https://pepy.tech/project/lattifai)
</div>

<p align="center">
🌐 <a href="https://lattifai.com"><b>Official Website</b></a>    |    🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a>    |    🤗 <a href="https://huggingface.co/Lattifai/Lattice-1">Model</a>    |    📑 <a href="https://lattifai.com/blogs">Blog</a>    |    <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
</p>

# LattifAI: Precision Alignment, Infinite Possibilities

Advanced forced alignment and subtitle generation powered by the [🤗 Lattice-1](https://huggingface.co/Lattifai/Lattice-1) model.

> **⚠️ Note on Current Limitations**:
> 1. **Memory Usage**: We are aware of high memory consumption and are actively working on further optimizations.
> 2. **Long Audio**: Long audio files may currently run into issues. Support for **long-form audio (up to 20 hours)** will be available shortly.

## Table of Contents

- [Installation](#installation)
- [Quick Start](#quick-start)
  - [Command Line Interface](#command-line-interface)
  - [Python SDK (5 Lines of Code)](#python-sdk-5-lines-of-code)
  - [Web Interface](#web-interface)
- [CLI Reference](#cli-reference)
  - [lai alignment align](#lai-alignment-align)
  - [lai alignment youtube](#lai-alignment-youtube)
  - [lai transcribe run](#lai-transcribe-run)
  - [lai transcribe align](#lai-transcribe-align)
  - [lai caption convert](#lai-caption-convert)
  - [lai caption shift](#lai-caption-shift)
- [Python SDK Reference](#python-sdk-reference)
  - [Basic Alignment](#basic-alignment)
  - [YouTube Processing](#youtube-processing)
  - [Configuration Objects](#configuration-objects)
- [Advanced Features](#advanced-features)
  - [Word-Level Alignment](#word-level-alignment)
  - [Smart Sentence Splitting](#smart-sentence-splitting)
  - [Speaker Diarization](#speaker-diarization-wip)
  - [YAML Configuration Files](#yaml-configuration-files)
- [Supported Formats](#supported-formats)
- [Roadmap](#roadmap)
- [Development](#development)

---

## Installation

### Step 1: Install SDK

**Using pip:**
```bash
pip install install-k2
install-k2 --torch-version 2.9.1  # if omitted, auto-detects your PyTorch version and installs a compatible k2

pip install lattifai==1.0.0
```

**Using uv (Recommended - 10-100x faster):**
```bash
# Install uv if you haven't already
curl -LsSf https://astral.sh/uv/install.sh | sh

# Create a new project with uv
uv init my-project
cd my-project
source .venv/bin/activate

# Install k2 (required dependency)
uv pip install install-k2
uv pip install pip
uv run install-k2 --torch-version 2.9.1

# Install LattifAI (v1.0.0)
uv pip install lattifai==1.0.0
```

> **Note**: `install-k2` automatically detects your PyTorch version (up to 2.9) and installs the compatible k2 wheel.

<details>
<summary><b>install-k2 options</b></summary>

```
usage: install-k2 [-h] [--system {linux,darwin,windows}] [--dry-run] [--torch-version TORCH_VERSION]

optional arguments:
  -h, --help                        Show this help message and exit
  --system {linux,darwin,windows}   Override OS detection
  --dry-run                         Show what would be installed without making changes
  --torch-version TORCH_VERSION     Specify torch version (e.g., 2.8.0)
```
</details>
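
A quick way to confirm the pairing before moving on is a minimal import check; this sketch assumes nothing beyond `torch` and `k2` being importable after installation:

```python
# Sanity check: a successful import means the installed k2 wheel matches
# your PyTorch build; a mismatch typically raises ImportError here.
import torch
import k2  # noqa: F401  (importing is the check)

print(f"PyTorch {torch.__version__}: k2 imported successfully")
```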

### Step 2: Get Your API Key

**LattifAI API Key (Required)**

Get your **free API key** at [https://lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)

**Option A: Environment variable (recommended)**
```bash
export LATTIFAI_API_KEY="lf_your_api_key_here"
```

**Option B: `.env` file**
```bash
# .env
LATTIFAI_API_KEY=lf_your_api_key_here
```
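
`python-dotenv` is a declared dependency of the SDK, which is what makes the `.env` option work. If you also want those variables in your own scripts before constructing a client, a minimal sketch (standard `python-dotenv` usage, nothing LattifAI-specific):

```python
# Load .env into the process environment explicitly.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory (no-op if absent)
print("key loaded:", os.getenv("LATTIFAI_API_KEY", "").startswith("lf_"))
```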

**Gemini API Key (Optional - for transcription)**

If you want to use Gemini models for transcription (e.g., `gemini-2.5-pro`), get your **free Gemini API key** at [https://aistudio.google.com/apikey](https://aistudio.google.com/apikey)

```bash
# Add to environment variable
export GEMINI_API_KEY="your_gemini_api_key_here"

# Or add to .env file
GEMINI_API_KEY=your_gemini_api_key_here  # AIzaSyxxxx
```

> **Note**: A Gemini API key is required only if you use Gemini models for transcription. It is not needed for alignment or when using other transcription models.

---

## Quick Start

### Command Line Interface

```bash
# Align local audio with a subtitle file
lai alignment align audio.wav subtitle.srt output.srt

# Download and align a YouTube video
lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
```

### Python SDK (5 Lines of Code)

```python
from lattifai import LattifAI

client = LattifAI()
caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
    output_caption_path="aligned.srt",
)
```

That's it! Your aligned subtitles are saved to `aligned.srt`.

### Web Interface

1. **Start the backend server:**
```bash
lai-server
```

2. **Start the frontend (in a new terminal):**
```bash
cd app
npm install
npm run dev
```

Visit `http://localhost:5173` to open the web interface.

---

## CLI Reference

### Command Overview

| Command | Description |
|---------|-------------|
| `lai alignment align` | Align local audio/video with a caption |
| `lai alignment youtube` | Download & align YouTube content |
| `lai transcribe run` | Transcribe audio/video or a YouTube URL to a caption |
| `lai transcribe align` | Transcribe audio/video and align the generated transcript |
| `lai caption convert` | Convert between caption formats |
| `lai caption normalize` | Clean and normalize caption text |
| `lai caption shift` | Shift caption timestamps |

### lai alignment align

```bash
# Basic usage
lai alignment align <audio> <caption> <output>

# Examples
lai alignment align audio.wav caption.srt output.srt
lai alignment align video.mp4 caption.vtt output.srt alignment.device=cuda
lai alignment align audio.wav caption.srt output.json \
    caption.split_sentence=true \
    caption.word_level=true
```

### lai alignment youtube

```bash
# Basic usage
lai alignment youtube <url>

# Examples
lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID" \
    media.output_dir=~/Downloads \
    caption.output_path=aligned.srt \
    caption.split_sentence=true
```

### lai transcribe run

Perform automatic speech recognition (ASR) on audio/video files or YouTube URLs to generate timestamped transcriptions.

```bash
# Basic usage - local file
lai transcribe run <input> <output>

# Basic usage - YouTube URL
lai transcribe run <url> <output_dir>

# Examples - local files
lai transcribe run audio.wav output.srt
lai transcribe run audio.mp4 output.ass \
    transcription.model_name=nvidia/parakeet-tdt-0.6b-v3

# Examples - YouTube URLs
lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output_dir=./output
lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output.ass output_dir=./output \
    transcription.model_name=gemini-2.5-pro \
    transcription.gemini_api_key=YOUR_GEMINI_API_KEY

# Full configuration with keyword arguments
lai transcribe run \
    input=audio.wav \
    output_caption=output.srt \
    channel_selector=average \
    transcription.device=cuda \
    transcription.model_name=iic/SenseVoiceSmall
```

**Parameters:**
- `input`: Path to an audio/video file or a YouTube URL (required)
- `output_caption`: Path for the output caption file (for local files)
- `output_dir`: Directory for output files (for YouTube URLs; defaults to the current directory)
- `media_format`: Media format for YouTube downloads (default: mp3)
- `channel_selector`: Audio channel selection - "average", "left", "right", or a channel index (default: "average"); see the sketch after this list
  - Note: Ignored when transcribing YouTube URLs with Gemini models
- `transcription`: Transcription configuration (model_name, device, language, gemini_api_key)
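
For clarity, the channel options above behave roughly like this on a multi-channel array. A minimal NumPy sketch, illustrative of the documented semantics only (it assumes a `(channels, samples)` layout and is not the SDK's internal code):

```python
# Illustrative semantics of channel_selector on a (channels, samples) array.
import numpy as np

def select_channel(audio: np.ndarray, selector="average") -> np.ndarray:
    if selector == "average":
        return audio.mean(axis=0)  # downmix all channels to mono
    if selector == "left":
        return audio[0]
    if selector == "right":
        return audio[1]
    return audio[int(selector)]    # an explicit channel index

stereo = np.random.randn(2, 16000)        # 1 second of stereo at 16 kHz
mono = select_channel(stereo, "average")  # shape: (16000,)
```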

**Supported Transcription Models (More Coming Soon):**
- `gemini-2.5-pro` - Google Gemini API (requires an API key)
  - Languages: 100+ languages including English, Chinese, Spanish, French, German, Japanese, Korean, Arabic, and more
- `gemini-3-pro-preview` - Google Gemini API (requires an API key)
  - Languages: 100+ languages (same as gemini-2.5-pro)
- `nvidia/parakeet-tdt-0.6b-v3` - NVIDIA Parakeet model
  - Languages: Bulgarian (bg), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hungarian (hu), Italian (it), Latvian (lv), Lithuanian (lt), Maltese (mt), Polish (pl), Portuguese (pt), Romanian (ro), Slovak (sk), Slovenian (sl), Spanish (es), Swedish (sv), Russian (ru), Ukrainian (uk)
- `iic/SenseVoiceSmall` - Alibaba SenseVoice model
  - Languages: Chinese/Mandarin (zh), English (en), Japanese (ja), Korean (ko), Cantonese (yue)
- More models will be integrated in future releases

**Note:** For transcription with alignment on local files, use `lai transcribe align` instead.

### lai transcribe align

Transcribe an audio/video file and automatically align the generated transcript with the audio.

This command combines transcription and alignment in a single step, producing precisely aligned captions.

```bash
# Basic usage
lai transcribe align <input_media> <output_caption>

# Examples
lai transcribe align audio.wav output.srt
lai transcribe align audio.mp4 output.ass \
    transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
    alignment.device=cuda

# Using Gemini transcription with alignment
lai transcribe align audio.wav output.srt \
    transcription.model_name=gemini-2.5-pro \
    transcription.gemini_api_key=YOUR_KEY \
    caption.split_sentence=true

# Full configuration
lai transcribe align \
    input_media=audio.wav \
    output_caption=output.srt \
    transcription.device=mps \
    transcription.model_name=iic/SenseVoiceSmall \
    alignment.device=cuda \
    caption.word_level=true
```

**Parameters:**
- `input_media`: Path to the input audio/video file (required)
- `output_caption`: Path for the output aligned caption file (required)
- `transcription`: Transcription configuration (model_name, device, language, gemini_api_key)
- `alignment`: Alignment configuration (model_name, device)
- `caption`: Caption formatting options (split_sentence, word_level, etc.)

### lai caption convert

```bash
lai caption convert input.srt output.vtt
lai caption convert input.srt output.json normalize_text=true
```

### lai caption shift

```bash
lai caption shift input.srt output.srt 2.0   # Delay by 2 seconds
lai caption shift input.srt output.srt -1.5  # Advance by 1.5 seconds
```
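
Both convert and shift have simple standalone equivalents built on `pysubs2`, one of the SDK's declared dependencies. A minimal sketch, illustrative only (this is not how the `lai` CLI is implemented internally):

```python
# Convert and shift subtitles with pysubs2.
import pysubs2

subs = pysubs2.load("input.srt")
subs.shift(s=2.0)        # delay every cue by 2 seconds (s=-1.5 would advance)
subs.save("output.vtt")  # output format is inferred from the file extension
```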

---

## Python SDK Reference

### Basic Alignment

```python
from lattifai import LattifAI

# Initialize client (uses LATTIFAI_API_KEY from environment)
client = LattifAI()

# Align audio/video with a subtitle
caption = client.alignment(
    input_media="audio.wav",           # Audio or video file
    input_caption="subtitle.srt",      # Input subtitle file
    output_caption_path="output.srt",  # Output aligned subtitle
    split_sentence=True,               # Enable smart sentence splitting
)

# Access alignment results
for segment in caption.supervisions:
    print(f"{segment.start:.2f}s - {segment.end:.2f}s: {segment.text}")
```

### YouTube Processing

```python
from lattifai import LattifAI

client = LattifAI()

# Download a YouTube video and align it with auto-downloaded subtitles
caption = client.youtube(
    url="https://youtube.com/watch?v=VIDEO_ID",
    output_dir="./downloads",
    output_caption_path="aligned.srt",
    split_sentence=True,
)
```

### Configuration Objects

LattifAI uses a config-driven architecture for fine-grained control:

#### ClientConfig - API Settings

```python
from lattifai import LattifAI, ClientConfig

client = LattifAI(
    client_config=ClientConfig(
        api_key="lf_your_api_key",  # Or use the LATTIFAI_API_KEY env var
        timeout=30.0,
        max_retries=3,
    )
)
```

#### AlignmentConfig - Model Settings

```python
from lattifai import LattifAI, AlignmentConfig

client = LattifAI(
    alignment_config=AlignmentConfig(
        model_name="Lattifai/Lattice-1",
        device="cuda",  # "cpu", "cuda", "cuda:0", "mps"
    )
)
```

#### CaptionConfig - Subtitle Settings

```python
from lattifai import LattifAI, CaptionConfig

client = LattifAI(
    caption_config=CaptionConfig(
        split_sentence=True,            # Smart sentence splitting
        word_level=True,                # Word-level timestamps
        normalize_text=True,            # Clean HTML entities
        include_speaker_in_text=False,  # Include speaker labels
    )
)
```

#### Complete Configuration Example

```python
from lattifai import (
    LattifAI,
    ClientConfig,
    AlignmentConfig,
    CaptionConfig,
)

client = LattifAI(
    client_config=ClientConfig(
        api_key="lf_your_api_key",
        timeout=60.0,
    ),
    alignment_config=AlignmentConfig(
        model_name="Lattifai/Lattice-1",
        device="cuda",
    ),
    caption_config=CaptionConfig(
        split_sentence=True,
        word_level=True,
        output_format="json",
    ),
)

caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
    output_caption_path="output.json",
)
```

### Available Exports

```python
from lattifai import (
    # Client classes
    LattifAI,
    # AsyncLattifAI,  # For async support

    # Config classes
    ClientConfig,
    AlignmentConfig,
    CaptionConfig,
    DiarizationConfig,
    MediaConfig,

    # I/O classes
    Caption,
)
```

---

## Advanced Features

### Word-Level Alignment

Enable `word_level=True` to get precise timestamps for each word:

```python
from lattifai import LattifAI, CaptionConfig

client = LattifAI(
    caption_config=CaptionConfig(word_level=True)
)

caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
    output_caption_path="output.json",  # JSON preserves word-level data
)

# Access word-level alignments
for segment in caption.alignments:
    if segment.alignment and "word" in segment.alignment:
        for word_item in segment.alignment["word"]:
            print(f"{word_item.start:.2f}s: {word_item.symbol} (confidence: {word_item.score:.2f})")
```

### Smart Sentence Splitting

The `split_sentence` option intelligently separates:
- Non-speech elements (`[APPLAUSE]`, `[MUSIC]`) from dialogue
- Multiple sentences within a single subtitle
- Speaker labels from content

```python
caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
    split_sentence=True,
)
```

### Speaker Diarization (WIP)

**Note:** This feature is currently under development and not yet fully available.

Speaker diarization automatically identifies and labels different speakers in audio. When enabled, the system will:
- Detect speaker changes in the audio
- Assign speaker labels (e.g., SPEAKER_00, SPEAKER_01) to each segment
- Update subtitle segments with speaker information

**Speaker Name Handling:**
- **Existing speaker labels in subtitles**: If your input captions already contain speaker names (e.g., `[Alice]`, `>> Bob:`, or `SPEAKER_01:`), the system preserves them as much as possible during alignment
- **Gemini Transcriber**: When using Gemini models for transcription (e.g., `gemini-2.5-pro`), the model can identify and extract speaker names from dialogue context, making it easier to generate speaker-aware transcripts

**Python SDK:**
```python
from lattifai import LattifAI, DiarizationConfig

client = LattifAI(
    diarization_config=DiarizationConfig(enabled=True)
)

caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
    output_caption_path="output.srt",
)

# Access speaker information
for segment in caption.supervisions:
    print(f"[{segment.speaker}] {segment.text}")
```

### YAML Configuration Files

Create reusable configuration files:

```yaml
# config/alignment.yaml
model_name: "Lattifai/Lattice-1"
device: "cuda"
batch_size: 1
```

```bash
lai alignment align audio.wav subtitle.srt output.srt \
    alignment=config/alignment.yaml
```
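
If you prefer to stay in Python, the same YAML can feed the config objects. A minimal sketch, assuming `AlignmentConfig` accepts these fields as keyword arguments (as shown under Configuration Objects above) and that PyYAML is available in your environment:

```python
# Sketch: load the YAML above into an AlignmentConfig.
import yaml

from lattifai import AlignmentConfig, LattifAI

with open("config/alignment.yaml") as f:
    cfg = yaml.safe_load(f)  # {"model_name": ..., "device": ..., "batch_size": ...}

client = LattifAI(alignment_config=AlignmentConfig(**cfg))
```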

---

## Supported Formats

LattifAI supports virtually all common media and subtitle formats:

| Type | Formats |
|------|---------|
| **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
| **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
| **Caption/Subtitle Input** | SRT, VTT, ASS, SSA, SUB, SBV, TXT, Gemini, and more |
| **Caption/Subtitle Output** | All input formats + TextGrid (Praat) |

**Tabular Formats:**
- **TSV**: Tab-separated values with an optional speaker column
- **CSV**: Comma-separated values with an optional speaker column
- **AUD**: Audacity labels format with `[[speaker]]` notation (see the sample below)

> **Note**: If a format is not listed above but commonly used, it's likely supported. Feel free to try it or reach out if you encounter any issues.
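
For orientation, rows in these tabular formats look roughly like the following. This is a hypothetical sample: Audacity label files are tab-separated start/end/label, but the TSV column order and `[[speaker]]` placement shown here are assumptions based on the descriptions above, not a format spec.

```
# TSV (hypothetical column order: start, end, speaker, text)
0.00	2.50	SPEAKER_00	Hello there.
2.50	4.10	SPEAKER_01	Hi!

# AUD (Audacity labels: start, end, label), speaker in [[...]]
0.000000	2.500000	[[SPEAKER_00]] Hello there.
```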

---

## Roadmap

Visit our [LattifAI roadmap](https://lattifai.com/roadmap) for the latest updates.

| Date | Release | Features |
|------|---------|----------|
| **Oct 2025** | **Lattice-1-Alpha** | ✅ English forced alignment<br>✅ Multi-format support<br>✅ CPU/GPU optimization |
| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed-language alignment<br>🚀 Integrate speaker diarization |

---

## Development

### Setup

```bash
git clone https://github.com/lattifai/lattifai-python.git
cd lattifai-python

# Using uv (recommended)
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
source .venv/bin/activate

# Or using pip
pip install -e ".[test]"

pre-commit install
```

### Testing

```bash
pytest                      # Run all tests
pytest --cov=src            # With coverage
pytest tests/test_basic.py  # Specific test
```

---

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make changes and add tests
4. Run `pytest` and `pre-commit run`
5. Submit a pull request

## License

Apache License 2.0

## Support

- **Issues**: [GitHub Issues](https://github.com/lattifai/lattifai-python/issues)
- **Discussions**: [GitHub Discussions](https://github.com/lattifai/lattifai-python/discussions)
- **Discord**: [Join our community](https://discord.gg/kvF4WsBRK8)