lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from dataclasses import dataclass
from typing import Any, List, Optional


@dataclass
class VideoMetadata:
    """Metadata describing a single YouTube video."""

    video_id: str
    title: str
    description: str
    duration: float  # seconds
    thumbnail_url: str
    channel_name: str
    view_count: int
    upload_date: Optional[str] = None  # absent when the upload date is unknown


@dataclass
class CaptionTrack:
    """A single caption/subtitle track advertised for a video."""

    language_code: str
    language_name: str
    kind: str  # 'manual' | 'asr'
    ext: str  # 'vtt', 'srv3' etc
    url: Optional[str] = None  # direct download URL when available
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lattifai
|
|
3
|
+
Version: 1.3.0
|
|
4
|
+
Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
|
|
5
|
+
Author-email: Lattifai Technologies <tech@lattifai.com>
|
|
6
|
+
Maintainer-email: Lattice <tech@lattifai.com>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2025 LattifAI.
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
|
|
29
|
+
Project-URL: Homepage, https://github.com/lattifai/lattifai-python
|
|
30
|
+
Project-URL: Documentation, https://github.com/lattifai/lattifai-python/blob/main/README.md
|
|
31
|
+
Project-URL: Bug Tracker, https://github.com/lattifai/lattifai-python/issues
|
|
32
|
+
Project-URL: Discussions, https://github.com/lattifai/lattifai-python/discussions
|
|
33
|
+
Project-URL: Changelog, https://github.com/lattifai/lattifai-python/blob/main/CHANGELOG.md
|
|
34
|
+
Keywords: lattifai,speech recognition,video analysis,ai,sdk,api client
|
|
35
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
36
|
+
Classifier: Intended Audience :: Developers
|
|
37
|
+
Classifier: Intended Audience :: Science/Research
|
|
38
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
44
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
45
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
46
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
47
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
48
|
+
Classifier: Topic :: Multimedia :: Video
|
|
49
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
50
|
+
Requires-Python: <3.15,>=3.10
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
License-File: LICENSE
|
|
53
|
+
Requires-Dist: python-dotenv
|
|
54
|
+
Requires-Dist: colorful>=0.5.6
|
|
55
|
+
Requires-Dist: lattifai-run>=1.0.1
|
|
56
|
+
Requires-Dist: lattifai-captions[splitting]>=0.1.6
|
|
57
|
+
Requires-Dist: lattifai-core-hq>=0.6.4
|
|
58
|
+
Requires-Dist: g2p-phonemizer>=0.4.0
|
|
59
|
+
Requires-Dist: error-align-fix>=0.1.4
|
|
60
|
+
Requires-Dist: lhotse>=1.26.0
|
|
61
|
+
Requires-Dist: k2py==0.2.4
|
|
62
|
+
Requires-Dist: onnxruntime
|
|
63
|
+
Requires-Dist: av
|
|
64
|
+
Requires-Dist: msgpack
|
|
65
|
+
Provides-Extra: event
|
|
66
|
+
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "event"
|
|
67
|
+
Provides-Extra: diarization
|
|
68
|
+
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "diarization"
|
|
69
|
+
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "diarization"
|
|
70
|
+
Provides-Extra: transcription
|
|
71
|
+
Requires-Dist: OmniSenseVoice>=0.4.2; extra == "transcription"
|
|
72
|
+
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "transcription"
|
|
73
|
+
Requires-Dist: google-genai>=1.22.0; extra == "transcription"
|
|
74
|
+
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "transcription"
|
|
75
|
+
Provides-Extra: youtube
|
|
76
|
+
Requires-Dist: questionary>=2.0; extra == "youtube"
|
|
77
|
+
Requires-Dist: yt-dlp; extra == "youtube"
|
|
78
|
+
Requires-Dist: pycryptodome; extra == "youtube"
|
|
79
|
+
Provides-Extra: dev
|
|
80
|
+
Requires-Dist: black; extra == "dev"
|
|
81
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
82
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
83
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
84
|
+
Provides-Extra: all
|
|
85
|
+
Requires-Dist: lattifai[transcription]; extra == "all"
|
|
86
|
+
Requires-Dist: lattifai[youtube]; extra == "all"
|
|
87
|
+
Dynamic: license-file
|
|
88
|
+
|
|
89
|
+
<div align="center">
|
|
90
|
+
<img src="https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/logo.png" width=256>
|
|
91
|
+
|
|
92
|
+
[](https://badge.fury.io/py/lattifai)
|
|
93
|
+
[](https://pypi.org/project/lattifai)
|
|
94
|
+
[](https://pepy.tech/project/lattifai)
|
|
95
|
+
</div>
|
|
96
|
+
|
|
97
|
+
<p align="center">
|
|
98
|
+
🌐 <a href="https://lattifai.com"><b>Official Website</b></a> | 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> | 🤗 <a href="https://huggingface.co/LattifAI/Lattice-1">Model</a> | 📑 <a href="https://lattifai.com/blogs">Blog</a> | <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
|
|
99
|
+
</p>
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# LattifAI: Precision Alignment, Infinite Possibilities
|
|
103
|
+
|
|
104
|
+
Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/LattifAI/Lattice-1) model.
|
|
105
|
+
|
|
106
|
+
## Table of Contents
|
|
107
|
+
|
|
108
|
+
- [Features](#features)
|
|
109
|
+
- [Installation](#installation)
|
|
110
|
+
- [Quick Start](#quick-start)
|
|
111
|
+
- [CLI Reference](#cli-reference)
|
|
112
|
+
- [Python SDK](#python-sdk)
|
|
113
|
+
- [Advanced Features](#advanced-features)
|
|
114
|
+
- [Text Processing](#text-processing)
|
|
115
|
+
- [Supported Formats & Languages](#supported-formats--languages)
|
|
116
|
+
- [Roadmap](#roadmap)
|
|
117
|
+
- [Development](#development)
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Features
|
|
122
|
+
|
|
123
|
+
| Feature | Description |
|
|
124
|
+
|---------|-------------|
|
|
125
|
+
| **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/LattifAI/Lattice-1) |
|
|
126
|
+
| **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) |
|
|
127
|
+
| **Speaker Diarization** | Multi-speaker identification with label preservation |
|
|
128
|
+
| **Streaming Mode** | Process audio up to 20 hours with minimal memory |
|
|
129
|
+
| **Universal Format Support** | 30+ caption/subtitle formats |
|
|
130
|
+
|
|
131
|
+
### Alignment Models
|
|
132
|
+
|
|
133
|
+
| Model | Links | Languages | Description |
|
|
134
|
+
|-------|-------|-----------|-------------|
|
|
135
|
+
| **Lattice-1** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1) | English, Chinese, German | Production model with mixed-language alignment support |
|
|
136
|
+
| **Lattice-1-Alpha** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1-Alpha) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1-Alpha) | English | Initial release with English forced alignment |
|
|
137
|
+
|
|
138
|
+
**Model Hub**: Models can be downloaded from `huggingface` (default) or `modelscope` (recommended for users in China):
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Use ModelScope (faster in China)
|
|
142
|
+
lai alignment align audio.wav caption.srt output.srt alignment.model_hub=modelscope
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from lattifai.client import LattifAI
|
|
147
|
+
from lattifai.config import AlignmentConfig
|
|
148
|
+
|
|
149
|
+
client = LattifAI(alignment_config=AlignmentConfig(model_hub="modelscope"))
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Installation
|
|
155
|
+
|
|
156
|
+
### Using uv (Recommended)
|
|
157
|
+
|
|
158
|
+
[uv](https://github.com/astral-sh/uv) is a fast Python package manager (10-100x faster than pip).
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
# Install uv
|
|
162
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
163
|
+
|
|
164
|
+
# Quick start (run without installing)
|
|
165
|
+
uvx --from lattifai lai --help
|
|
166
|
+
|
|
167
|
+
# Or create a project
|
|
168
|
+
mkdir my-project && cd my-project
|
|
169
|
+
uv init --bare && uv add "lattifai[all]"
|
|
170
|
+
uv run lai alignment align audio.wav caption.srt output.srt
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Using pip
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Full installation (recommended)
|
|
177
|
+
pip install "lattifai[all]"
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Installation Options
|
|
181
|
+
|
|
182
|
+
| Extra | Command | Includes |
|
|
183
|
+
|-------|---------|----------|
|
|
184
|
+
| (base) | `pip install lattifai` | Forced alignment (Lattice-1, k2py, ONNX, captions) |
|
|
185
|
+
| `all` | `pip install "lattifai[all]"` | Base + transcription + youtube |
|
|
186
|
+
| `transcription` | `pip install "lattifai[transcription]"` | ASR models (Gemini, Parakeet, SenseVoice) |
|
|
187
|
+
| `youtube` | `pip install "lattifai[youtube]"` | YouTube download (yt-dlp) |
|
|
188
|
+
| `diarization` | `pip install "lattifai[diarization]"` | Speaker diarization (NeMo, pyannote) |
|
|
189
|
+
| `event` | `pip install "lattifai[event]"` | Audio event detection |
|
|
190
|
+
|
|
191
|
+
**Note:** Base installation includes full alignment functionality. Use `[all]` for transcription and YouTube features.
|
|
192
|
+
|
|
193
|
+
### Caption Format Support
|
|
194
|
+
|
|
195
|
+
Caption/subtitle format parsing is provided by [lattifai-captions](https://github.com/lattifai/captions), a separate package supporting 30+ formats (SRT, VTT, ASS, TTML, TextGrid, NLE formats, etc.). It is installed automatically with the base `lattifai` package (no extra required).
|
|
196
|
+
|
|
197
|
+
### API Keys
|
|
198
|
+
|
|
199
|
+
**LattifAI API Key (Required)** - Get your free key at [lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
export LATTIFAI_API_KEY="lf_your_api_key_here"
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
**Gemini API Key (Optional)** - For transcription with Gemini models, get key at [aistudio.google.com/apikey](https://aistudio.google.com/apikey)
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
export GEMINI_API_KEY="your_gemini_api_key_here"
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Or use a `.env` file:
|
|
212
|
+
```bash
|
|
213
|
+
LATTIFAI_API_KEY=lf_your_api_key_here
|
|
214
|
+
GEMINI_API_KEY=your_gemini_api_key_here
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Quick Start
|
|
220
|
+
|
|
221
|
+
### Command Line
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# Align audio with subtitle
|
|
225
|
+
lai alignment align audio.wav subtitle.srt output.srt
|
|
226
|
+
|
|
227
|
+
# YouTube video
|
|
228
|
+
lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Python SDK
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
from lattifai.client import LattifAI
|
|
235
|
+
|
|
236
|
+
client = LattifAI()
|
|
237
|
+
caption = client.alignment(
|
|
238
|
+
input_media="audio.wav",
|
|
239
|
+
input_caption="subtitle.srt",
|
|
240
|
+
output_caption_path="aligned.srt",
|
|
241
|
+
)
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## CLI Reference
|
|
247
|
+
|
|
248
|
+
| Command | Description | Example |
|
|
249
|
+
|---------|-------------|---------|
|
|
250
|
+
| `lai alignment align` | Align audio/video with caption | `lai alignment align audio.wav caption.srt output.srt` |
|
|
251
|
+
| `lai alignment youtube` | Download & align YouTube | `lai alignment youtube "https://youtube.com/watch?v=ID"` |
|
|
252
|
+
| `lai transcribe run` | Transcribe audio/video | `lai transcribe run audio.wav output.srt` |
|
|
253
|
+
| `lai transcribe align` | Transcribe and align | `lai transcribe align audio.wav output.srt` |
|
|
254
|
+
| `lai caption convert` | Convert caption formats | `lai caption convert input.srt output.vtt` |
|
|
255
|
+
| `lai caption shift` | Shift timestamps | `lai caption shift input.srt output.srt 2.0` |
|
|
256
|
+
|
|
257
|
+
### Common Options
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
# Device selection
|
|
261
|
+
alignment.device=cuda # cuda, mps, cpu
|
|
262
|
+
|
|
263
|
+
# Caption options
|
|
264
|
+
caption.split_sentence=true # Smart sentence splitting
|
|
265
|
+
caption.word_level=true # Word-level timestamps
|
|
266
|
+
|
|
267
|
+
# Streaming for long audio
|
|
268
|
+
media.streaming_chunk_secs=600
|
|
269
|
+
|
|
270
|
+
# Channel selection
|
|
271
|
+
media.channel_selector=left # left, right, average, or index
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Transcription Models
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
# Gemini (100+ languages, requires GEMINI_API_KEY)
|
|
278
|
+
transcription.model_name=gemini-2.5-pro
|
|
279
|
+
|
|
280
|
+
# Parakeet (24 European languages)
|
|
281
|
+
transcription.model_name=nvidia/parakeet-tdt-0.6b-v3
|
|
282
|
+
|
|
283
|
+
# SenseVoice (zh, en, ja, ko, yue)
|
|
284
|
+
transcription.model_name=iic/SenseVoiceSmall
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### lai transcribe run
|
|
288
|
+
|
|
289
|
+
Transcribe audio/video files or YouTube URLs to generate timestamped captions.
|
|
290
|
+
|
|
291
|
+
```bash
|
|
292
|
+
# Local file
|
|
293
|
+
lai transcribe run audio.wav output.srt
|
|
294
|
+
|
|
295
|
+
# YouTube URL
|
|
296
|
+
lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output_dir=./output
|
|
297
|
+
|
|
298
|
+
# With model selection
|
|
299
|
+
lai transcribe run audio.wav output.srt \
|
|
300
|
+
transcription.model_name=gemini-2.5-pro \
|
|
301
|
+
transcription.device=cuda
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
**Parameters:**
|
|
305
|
+
- `input`: Path to audio/video file or YouTube URL
|
|
306
|
+
- `output_caption`: Output caption file path (for local files)
|
|
307
|
+
- `output_dir`: Output directory (for YouTube URLs, defaults to current directory)
|
|
308
|
+
- `channel_selector`: Audio channel - `average` (default), `left`, `right`, or channel index
|
|
309
|
+
|
|
310
|
+
### lai transcribe align
|
|
311
|
+
|
|
312
|
+
Transcribe and align in a single step - produces precisely aligned captions.
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
# Basic usage
|
|
316
|
+
lai transcribe align audio.wav output.srt
|
|
317
|
+
|
|
318
|
+
# With options
|
|
319
|
+
lai transcribe align audio.wav output.srt \
|
|
320
|
+
transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
|
|
321
|
+
alignment.device=cuda \
|
|
322
|
+
caption.split_sentence=true \
|
|
323
|
+
caption.word_level=true
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
---
|
|
327
|
+
|
|
328
|
+
## Python SDK
|
|
329
|
+
|
|
330
|
+
### Configuration Objects
|
|
331
|
+
|
|
332
|
+
```python
|
|
333
|
+
from lattifai.client import LattifAI
|
|
334
|
+
from lattifai.config import (
|
|
335
|
+
ClientConfig,
|
|
336
|
+
AlignmentConfig,
|
|
337
|
+
CaptionConfig,
|
|
338
|
+
DiarizationConfig,
|
|
339
|
+
MediaConfig,
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
client = LattifAI(
|
|
343
|
+
client_config=ClientConfig(api_key="lf_xxx", timeout=60.0),
|
|
344
|
+
alignment_config=AlignmentConfig(device="cuda"),
|
|
345
|
+
caption_config=CaptionConfig(split_sentence=True, word_level=True),
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
caption = client.alignment(
|
|
349
|
+
input_media="audio.wav",
|
|
350
|
+
input_caption="subtitle.srt",
|
|
351
|
+
output_caption_path="output.json",
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
# Access results
|
|
355
|
+
for segment in caption.supervisions:
|
|
356
|
+
print(f"{segment.start:.2f}s - {segment.end:.2f}s: {segment.text}")
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
### YouTube Processing
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
caption = client.youtube(
|
|
363
|
+
url="https://youtube.com/watch?v=VIDEO_ID",
|
|
364
|
+
output_dir="./downloads",
|
|
365
|
+
output_caption_path="aligned.srt",
|
|
366
|
+
)
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
### CaptionConfig Options
|
|
370
|
+
|
|
371
|
+
| Option | Default | Description |
|
|
372
|
+
|--------|---------|-------------|
|
|
373
|
+
| `split_sentence` | `False` | Smart sentence splitting, separates non-speech elements |
|
|
374
|
+
| `word_level` | `False` | Include word-level timestamps in output |
|
|
375
|
+
| `normalize_text` | `True` | Clean HTML entities and special characters |
|
|
376
|
+
| `include_speaker_in_text` | `True` | Include speaker labels in text output |
|
|
377
|
+
|
|
378
|
+
```python
|
|
379
|
+
from lattifai.client import LattifAI
|
|
380
|
+
from lattifai.config import CaptionConfig
|
|
381
|
+
|
|
382
|
+
client = LattifAI(
|
|
383
|
+
caption_config=CaptionConfig(
|
|
384
|
+
split_sentence=True,
|
|
385
|
+
word_level=True,
|
|
386
|
+
normalize_text=True,
|
|
387
|
+
include_speaker_in_text=False,
|
|
388
|
+
)
|
|
389
|
+
)
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
---
|
|
393
|
+
|
|
394
|
+
## Advanced Features
|
|
395
|
+
|
|
396
|
+
### Streaming Mode (Long Audio)
|
|
397
|
+
|
|
398
|
+
Process audio up to 20 hours with minimal memory:
|
|
399
|
+
|
|
400
|
+
```python
|
|
401
|
+
caption = client.alignment(
|
|
402
|
+
input_media="long_audio.wav",
|
|
403
|
+
input_caption="subtitle.srt",
|
|
404
|
+
streaming_chunk_secs=600.0, # 10-minute chunks
|
|
405
|
+
)
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
### Word-Level Alignment
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
from lattifai.client import LattifAI
|
|
412
|
+
from lattifai.config import CaptionConfig
|
|
413
|
+
|
|
414
|
+
client = LattifAI(caption_config=CaptionConfig(word_level=True))
|
|
415
|
+
caption = client.alignment(
|
|
416
|
+
input_media="audio.wav",
|
|
417
|
+
input_caption="subtitle.srt",
|
|
418
|
+
output_caption_path="output.json", # JSON preserves word-level data
|
|
419
|
+
)
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
### Speaker Diarization
|
|
423
|
+
|
|
424
|
+
Automatically identify and label different speakers in audio.
|
|
425
|
+
|
|
426
|
+
**Capabilities:**
|
|
427
|
+
- **Multi-Speaker Detection**: Automatically detect speaker changes
|
|
428
|
+
- **Smart Labeling**: Assign labels (SPEAKER_00, SPEAKER_01, etc.)
|
|
429
|
+
- **Label Preservation**: Maintain existing speaker names from input captions
|
|
430
|
+
- **Gemini Integration**: Extract speaker names from transcription context
|
|
431
|
+
|
|
432
|
+
**Label Handling:**
|
|
433
|
+
- Without existing labels → Generic labels (SPEAKER_00, SPEAKER_01)
|
|
434
|
+
- With existing labels (`[Alice]`, `>> Bob:`, `SPEAKER_01:`) → Preserved during alignment
|
|
435
|
+
- Gemini transcription → Names extracted from context (e.g., "Hi, I'm Alice" → `Alice`)
|
|
436
|
+
|
|
437
|
+
```python
|
|
438
|
+
from lattifai.client import LattifAI
|
|
439
|
+
from lattifai.config import DiarizationConfig
|
|
440
|
+
|
|
441
|
+
client = LattifAI(
|
|
442
|
+
diarization_config=DiarizationConfig(
|
|
443
|
+
enabled=True,
|
|
444
|
+
device="cuda",
|
|
445
|
+
min_speakers=2,
|
|
446
|
+
max_speakers=4,
|
|
447
|
+
)
|
|
448
|
+
)
|
|
449
|
+
caption = client.alignment(...)
|
|
450
|
+
|
|
451
|
+
for segment in caption.supervisions:
|
|
452
|
+
print(f"[{segment.speaker}] {segment.text}")
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
**CLI:**
|
|
456
|
+
```bash
|
|
457
|
+
lai alignment align audio.wav subtitle.srt output.srt \
|
|
458
|
+
diarization.enabled=true \
|
|
459
|
+
diarization.device=cuda
|
|
460
|
+
```
|
|
461
|
+
|
|
462
|
+
### Data Flow
|
|
463
|
+
|
|
464
|
+
```
|
|
465
|
+
Input Media → AudioLoader → Aligner → (Diarizer) → Caption
|
|
466
|
+
↑
|
|
467
|
+
Input Caption → Reader → Tokenizer
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
---
|
|
471
|
+
|
|
472
|
+
## Text Processing
|
|
473
|
+
|
|
474
|
+
The tokenizer handles various text patterns for forced alignment.
|
|
475
|
+
|
|
476
|
+
### Bracket/Caption Handling
|
|
477
|
+
|
|
478
|
+
Visual captions and annotations in brackets are treated specially - they get **two pronunciation paths** so the aligner can choose:
|
|
479
|
+
1. **Silence path** - skip when content doesn't appear in audio
|
|
480
|
+
2. **Inner text pronunciation** - match if someone actually says the words
|
|
481
|
+
|
|
482
|
+
| Bracket Type | Symbol | Example | Alignment Behavior |
|
|
483
|
+
|--------------|--------|---------|-------------------|
|
|
484
|
+
| Half-width square | `[]` | `[APPLAUSE]` | Skip or match "applause" |
|
|
485
|
+
| Half-width paren | `()` | `(music)` | Skip or match "music" |
|
|
486
|
+
| Full-width square | `【】` | `【笑声】` | Skip or match "笑声" |
|
|
487
|
+
| Full-width paren | `()` | `(音乐)` | Skip or match "音乐" |
|
|
488
|
+
| Angle brackets | `<>` | `<intro>` | Skip or match "intro" |
|
|
489
|
+
| Book title marks | `《》` | `《开场白》` | Skip or match "开场白" |
|
|
490
|
+
|
|
491
|
+
This allows proper handling of:
|
|
492
|
+
- **Visual descriptions**: `[Barret adjusts the camera and smiles]` → skipped if not spoken
|
|
493
|
+
- **Sound effects**: `[APPLAUSE]`, `(music)` → matched if audible
|
|
494
|
+
- **Chinese annotations**: `【笑声】`, `(鼓掌)` → flexible alignment
|
|
495
|
+
|
|
496
|
+
### Multilingual Text
|
|
497
|
+
|
|
498
|
+
| Pattern | Handling | Example |
|
|
499
|
+
|---------|----------|---------|
|
|
500
|
+
| CJK characters | Split individually | `你好` → `["你", "好"]` |
|
|
501
|
+
| Latin words | Grouped with accents | `Kühlschrank` → `["Kühlschrank"]` |
|
|
502
|
+
| Contractions | Kept together | `I'm`, `don't`, `we'll` |
|
|
503
|
+
| Punctuation | Attached to words | `Hello,` `world!` |
|
|
504
|
+
|
|
505
|
+
### Speaker Labels
|
|
506
|
+
|
|
507
|
+
Recognized speaker patterns are preserved during alignment:
|
|
508
|
+
|
|
509
|
+
| Format | Example | Output |
|
|
510
|
+
|--------|---------|--------|
|
|
511
|
+
| Arrow prefix | `>> Alice:` | `[Alice]` |
|
|
512
|
+
| LattifAI format | `[SPEAKER_01]:` | `[SPEAKER_01]` |
|
|
513
|
+
| Uppercase name | `SPEAKER NAME:` | `[SPEAKER NAME]` |
|
|
514
|
+
|
|
515
|
+
---
|
|
516
|
+
|
|
517
|
+
## Supported Formats & Languages
|
|
518
|
+
|
|
519
|
+
### Media Formats
|
|
520
|
+
|
|
521
|
+
| Type | Formats |
|
|
522
|
+
|------|---------|
|
|
523
|
+
| **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
|
|
524
|
+
| **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
|
|
525
|
+
| **Caption** | SRT, VTT, ASS, SSA, SRV3, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
|
|
526
|
+
|
|
527
|
+
> **Note**: Caption format handling is provided by [lattifai-captions](https://github.com/lattifai/captions), which is automatically installed as a dependency. For standalone caption processing without alignment features, install `pip install lattifai-captions`.
|
|
528
|
+
|
|
529
|
+
### JSON Format
|
|
530
|
+
|
|
531
|
+
JSON is the most flexible format for storing caption data with full word-level timing support:
|
|
532
|
+
|
|
533
|
+
```json
|
|
534
|
+
[
|
|
535
|
+
{
|
|
536
|
+
"text": "Hello beautiful world",
|
|
537
|
+
"start": 0.0,
|
|
538
|
+
"end": 2.5,
|
|
539
|
+
"speaker": "Speaker 1",
|
|
540
|
+
"words": [
|
|
541
|
+
{"word": "Hello", "start": 0.0, "end": 0.5},
|
|
542
|
+
{"word": "beautiful", "start": 0.6, "end": 1.4},
|
|
543
|
+
{"word": "world", "start": 1.5, "end": 2.5}
|
|
544
|
+
]
|
|
545
|
+
}
|
|
546
|
+
]
|
|
547
|
+
```
|
|
548
|
+
|
|
549
|
+
**Features:**
|
|
550
|
+
- Word-level timestamps preserved in `words` array
|
|
551
|
+
- Round-trip compatible (read/write without data loss)
|
|
552
|
+
- Optional `speaker` field for multi-speaker content
|
|
553
|
+
|
|
554
|
+
### Word-Level and Karaoke Output
|
|
555
|
+
|
|
556
|
+
| Format | `word_level=True` | `word_level=True` + `karaoke=True` |
|
|
557
|
+
|--------|-------------------|-----------------------------------|
|
|
558
|
+
| **JSON** | Includes `words` array | Same as word_level=True |
|
|
559
|
+
| **SRT** | One word per segment | One word per segment |
|
|
560
|
+
| **VTT** | One word per segment | YouTube VTT style: `<00:00:00.000><c> word</c>` |
|
|
561
|
+
| **ASS** | One word per segment | `{\kf}` karaoke tags (sweep effect) |
|
|
562
|
+
| **LRC** | One word per line | Enhanced `<timestamp>` tags |
|
|
563
|
+
| **TTML** | One word per `<p>` element | `<span>` with `itunes:timing="Word"` |
|
|
564
|
+
|
|
565
|
+
### VTT Format (YouTube VTT Support)
|
|
566
|
+
|
|
567
|
+
The VTT format handler supports both standard WebVTT and YouTube VTT with word-level timestamps.
|
|
568
|
+
|
|
569
|
+
**Reading**: VTT automatically detects YouTube VTT format (with `<timestamp><c>` tags) and extracts word-level alignment data:
|
|
570
|
+
|
|
571
|
+
```
|
|
572
|
+
WEBVTT
|
|
573
|
+
|
|
574
|
+
00:00:00.000 --> 00:00:02.000
|
|
575
|
+
<00:00:00.000><c> Hello</c><00:00:00.500><c> world</c>
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
**Writing**: Use `word_level=True` with `karaoke_config` to output YouTube VTT style:
|
|
579
|
+
|
|
580
|
+
```python
|
|
581
|
+
from lattifai.caption import Caption
|
|
582
|
+
from lattifai.caption.config import KaraokeConfig
|
|
583
|
+
|
|
584
|
+
caption = Caption.read("input.vtt")
|
|
585
|
+
caption.write(
|
|
586
|
+
"output.vtt",
|
|
587
|
+
word_level=True,
|
|
588
|
+
karaoke_config=KaraokeConfig(enabled=True)
|
|
589
|
+
)
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
```bash
|
|
593
|
+
# CLI: Convert to YouTube VTT with word-level timestamps
|
|
594
|
+
lai caption convert input.json output.vtt \
|
|
595
|
+
caption.word_level=true \
|
|
596
|
+
caption.karaoke.enabled=true
|
|
597
|
+
```
|
|
598
|
+
|
|
599
|
+
### Transcription Language Support
|
|
600
|
+
|
|
601
|
+
#### Gemini Models (100+ Languages)
|
|
602
|
+
|
|
603
|
+
**Models**: `gemini-2.5-pro`, `gemini-3-pro-preview`, `gemini-3-flash-preview`
|
|
604
|
+
|
|
605
|
+
English, Chinese (Mandarin & Cantonese), Spanish, French, German, Italian, Portuguese, Japanese, Korean, Arabic, Russian, Hindi, Bengali, Turkish, Dutch, Polish, Swedish, Danish, Norwegian, Finnish, Greek, Hebrew, Thai, Vietnamese, Indonesian, Malay, Filipino, Ukrainian, Czech, Romanian, Hungarian, and 70+ more.
|
|
606
|
+
|
|
607
|
+
> Requires Gemini API key from [Google AI Studio](https://aistudio.google.com/apikey)
|
|
608
|
+
|
|
609
|
+
#### NVIDIA Parakeet (24 European Languages)
|
|
610
|
+
|
|
611
|
+
**Model**: `nvidia/parakeet-tdt-0.6b-v3`
|
|
612
|
+
|
|
613
|
+
| Region | Languages |
|
|
614
|
+
|--------|-----------|
|
|
615
|
+
| Western Europe | English (en), French (fr), German (de), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl) |
|
|
616
|
+
| Nordic | Danish (da), Swedish (sv), Norwegian (no), Finnish (fi) |
|
|
617
|
+
| Eastern Europe | Polish (pl), Czech (cs), Slovak (sk), Hungarian (hu), Romanian (ro), Bulgarian (bg), Ukrainian (uk), Russian (ru) |
|
|
618
|
+
| Others | Croatian (hr), Estonian (et), Latvian (lv), Lithuanian (lt), Slovenian (sl), Maltese (mt), Greek (el) |
|
|
619
|
+
|
|
620
|
+
#### Alibaba SenseVoice (5 Asian Languages)
|
|
621
|
+
|
|
622
|
+
**Model**: `iic/SenseVoiceSmall`
|
|
623
|
+
|
|
624
|
+
Chinese/Mandarin (zh), English (en), Japanese (ja), Korean (ko), Cantonese (yue)
|
|
625
|
+
|
|
626
|
+
---
|
|
627
|
+
|
|
628
|
+
## Roadmap
|
|
629
|
+
|
|
630
|
+
Visit [lattifai.com/roadmap](https://lattifai.com/roadmap) for updates.
|
|
631
|
+
|
|
632
|
+
| Date | Release | Features |
|
|
633
|
+
|------|---------|----------|
|
|
634
|
+
| **Oct 2025** | Lattice-1-Alpha | ✅ English forced alignment, multi-format support |
|
|
635
|
+
| **Nov 2025** | Lattice-1 | ✅ EN+ZH+DE, speaker diarization, multi-model transcription |
|
|
636
|
+
| **Q1 2026** | Lattice-2 | ✅ Streaming mode, 🔮 40+ languages, real-time alignment |
|
|
637
|
+
|
|
638
|
+
---
|
|
639
|
+
|
|
640
|
+
## Development
|
|
641
|
+
|
|
642
|
+
```bash
|
|
643
|
+
git clone https://github.com/lattifai/lattifai-python.git
|
|
644
|
+
cd lattifai-python
|
|
645
|
+
|
|
646
|
+
# Using uv (recommended)
|
|
647
|
+
uv sync && source .venv/bin/activate
|
|
648
|
+
|
|
649
|
+
# Or pip
|
|
650
|
+
pip install -e ".[all,dev]"
|
|
651
|
+
|
|
652
|
+
# Run tests
|
|
653
|
+
pytest
|
|
654
|
+
|
|
655
|
+
# Install pre-commit hooks
|
|
656
|
+
pre-commit install
|
|
657
|
+
```
|
|
658
|
+
|
|
659
|
+
## Contributing
|
|
660
|
+
|
|
661
|
+
1. Fork the repository
|
|
662
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
663
|
+
3. Make changes and add tests
|
|
664
|
+
4. Run `pytest` and `pre-commit run --all-files`
|
|
665
|
+
5. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
666
|
+
6. Push to branch (`git push origin feature/amazing-feature`)
|
|
667
|
+
7. Open a Pull Request
|
|
668
|
+
|
|
669
|
+
---
|
|
670
|
+
|
|
671
|
+
## Support
|
|
672
|
+
|
|
673
|
+
- **Issues**: [GitHub Issues](https://github.com/lattifai/lattifai-python/issues)
|
|
674
|
+
- **Discord**: [Join our community](https://discord.gg/kvF4WsBRK8)
|
|
675
|
+
|
|
676
|
+
## License
|
|
677
|
+
|
|
678
|
+
MIT License
|