lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,808 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: lattifai
|
|
3
|
-
Version: 0.4.5
|
|
4
|
-
Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
|
|
5
|
-
Author-email: Lattifai Technologies <tech@lattifai.com>
|
|
6
|
-
Maintainer-email: Lattice <tech@lattifai.com>
|
|
7
|
-
License: MIT License
|
|
8
|
-
|
|
9
|
-
Copyright (c) 2025 Lattifai.
|
|
10
|
-
|
|
11
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
-
in the Software without restriction, including without limitation the rights
|
|
14
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
-
furnished to do so, subject to the following conditions:
|
|
17
|
-
|
|
18
|
-
The above copyright notice and this permission notice shall be included in all
|
|
19
|
-
copies or substantial portions of the Software.
|
|
20
|
-
|
|
21
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
-
SOFTWARE.
|
|
28
|
-
Project-URL: Homepage, https://github.com/lattifai/lattifai-python
|
|
29
|
-
Project-URL: Documentation, https://github.com/lattifai/lattifai-python/README.md
|
|
30
|
-
Project-URL: Bug Tracker, https://github.com/lattifai/lattifai-python/issues
|
|
31
|
-
Project-URL: Discussions, https://github.com/lattifai/lattifai-python/discussions
|
|
32
|
-
Project-URL: Changelog, https://github.com/lattifai/lattifai-python/CHANGELOG.md
|
|
33
|
-
Keywords: lattifai,speech recognition,video analysis,ai,sdk,api client
|
|
34
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
35
|
-
Classifier: Intended Audience :: Developers
|
|
36
|
-
Classifier: Intended Audience :: Science/Research
|
|
37
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
38
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
42
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
|
43
|
-
Classifier: Operating System :: POSIX :: Linux
|
|
44
|
-
Classifier: Operating System :: Microsoft :: Windows
|
|
45
|
-
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
46
|
-
Classifier: Topic :: Multimedia :: Video
|
|
47
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
48
|
-
Requires-Python: <3.14,>=3.10
|
|
49
|
-
Description-Content-Type: text/markdown
|
|
50
|
-
License-File: LICENSE
|
|
51
|
-
Requires-Dist: lattifai-core>=0.2.1
|
|
52
|
-
Requires-Dist: httpx
|
|
53
|
-
Requires-Dist: python-dotenv
|
|
54
|
-
Requires-Dist: lhotse>=1.26.0
|
|
55
|
-
Requires-Dist: colorful>=0.5.6
|
|
56
|
-
Requires-Dist: pysubs2
|
|
57
|
-
Requires-Dist: praatio
|
|
58
|
-
Requires-Dist: tgt
|
|
59
|
-
Requires-Dist: onnxruntime
|
|
60
|
-
Requires-Dist: g2p-phonemizer==0.1.1
|
|
61
|
-
Requires-Dist: wtpsplit>=2.1.6
|
|
62
|
-
Requires-Dist: av
|
|
63
|
-
Requires-Dist: questionary>=2.0
|
|
64
|
-
Requires-Dist: yt-dlp
|
|
65
|
-
Requires-Dist: pycryptodome
|
|
66
|
-
Requires-Dist: google-genai
|
|
67
|
-
Provides-Extra: numpy
|
|
68
|
-
Requires-Dist: numpy; extra == "numpy"
|
|
69
|
-
Provides-Extra: test
|
|
70
|
-
Requires-Dist: pytest; extra == "test"
|
|
71
|
-
Requires-Dist: pytest-cov; extra == "test"
|
|
72
|
-
Requires-Dist: pytest-asyncio; extra == "test"
|
|
73
|
-
Requires-Dist: ruff; extra == "test"
|
|
74
|
-
Requires-Dist: numpy; extra == "test"
|
|
75
|
-
Provides-Extra: all
|
|
76
|
-
Requires-Dist: numpy; extra == "all"
|
|
77
|
-
Requires-Dist: pytest; extra == "all"
|
|
78
|
-
Requires-Dist: pytest-cov; extra == "all"
|
|
79
|
-
Requires-Dist: pytest-asyncio; extra == "all"
|
|
80
|
-
Requires-Dist: ruff; extra == "all"
|
|
81
|
-
Dynamic: license-file
|
|
82
|
-
|
|
83
|
-
<div align="center">
|
|
84
|
-
<img src="https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/logo.png" width=256>
|
|
85
|
-
|
|
86
|
-
[PyPI version](https://badge.fury.io/py/lattifai)
|
|
87
|
-
[Supported Python versions](https://pypi.org/project/lattifai)
|
|
88
|
-
[Download statistics](https://pepy.tech/project/lattifai)
|
|
89
|
-
</div>
|
|
90
|
-
|
|
91
|
-
<p align="center">
|
|
92
|
-
🌐 <a href="https://lattifai.com"><b>Official Website</b></a>    |    🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a>    |    🤗 <a href="https://huggingface.co/Lattifai/Lattice-1-Alpha">Model</a>    |    📑 <a href="https://lattifai.com/blogs">Blog</a>    |    <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
|
|
93
|
-
</p>
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
# LattifAI Python
|
|
97
|
-
|
|
98
|
-
Advanced forced alignment and subtitle generation powered by [Lattice-1-Alpha](https://huggingface.co/Lattifai/Lattice-1-Alpha) model.
|
|
99
|
-
|
|
100
|
-
## Installation
|
|
101
|
-
|
|
102
|
-
```bash
|
|
103
|
-
pip install install-k2
|
|
104
|
-
# The installation will automatically detect and use your already installed PyTorch version (up to 2.8).
|
|
105
|
-
install-k2 # Install k2
|
|
106
|
-
|
|
107
|
-
pip install lattifai
|
|
108
|
-
```
|
|
109
|
-
> **⚠️ Important**: You must run `install-k2` before using the lattifai library.
|
|
110
|
-
```
|
|
111
|
-
> install-k2 --help
|
|
112
|
-
usage: install-k2 [-h] [--system {linux,darwin,windows}] [--dry-run] [--torch-version TORCH_VERSION]
|
|
113
|
-
|
|
114
|
-
Auto-install the latest k2 wheel for your environment.
|
|
115
|
-
|
|
116
|
-
optional arguments:
|
|
117
|
-
-h, --help show this help message and exit
|
|
118
|
-
--system {linux,darwin,windows}
|
|
119
|
-
Override OS detection. Valid values: linux, darwin (macOS), windows. Default: auto-detect
|
|
120
|
-
--dry-run Show what would be installed without making changes.
|
|
121
|
-
--torch-version TORCH_VERSION
|
|
122
|
-
Specify torch version (e.g., 2.8.0). If not specified, will auto-detect or use latest available.
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
## Quick Start
|
|
127
|
-
|
|
128
|
-
### Command Line
|
|
129
|
-
|
|
130
|
-
The library provides two equivalent commands: `lai` (recommended for convenience) and `lattifai`.
|
|
131
|
-
|
|
132
|
-
```bash
|
|
133
|
-
# Align audio with subtitle (using lai command)
|
|
134
|
-
lai align audio.wav subtitle.srt output.srt
|
|
135
|
-
# Or use the full command
|
|
136
|
-
lattifai align audio.wav subtitle.srt output.srt
|
|
137
|
-
|
|
138
|
-
# Download and align YouTube content directly
|
|
139
|
-
lai youtube https://www.youtube.com/watch?v=VIDEO_ID
|
|
140
|
-
|
|
141
|
-
# Process YouTube videos with intelligent workflow (equivalent to lai youtube)
|
|
142
|
-
lai agent --youtube https://www.youtube.com/watch?v=VIDEO_ID
|
|
143
|
-
|
|
144
|
-
# Convert subtitle format
|
|
145
|
-
lai subtitle convert input.srt output.vtt
|
|
146
|
-
```
|
|
147
|
-
|
|
148
|
-
> **💡 Tip**: Use `lai` for faster typing in your daily workflow!
|
|
149
|
-
|
|
150
|
-
#### Command Quick Reference
|
|
151
|
-
|
|
152
|
-
| Command | Use Case | Best For |
|
|
153
|
-
|---------|----------|----------|
|
|
154
|
-
| `lai align` | Align existing audio + subtitle files | Local files, custom workflows |
|
|
155
|
-
| `lai youtube` | Download & align YouTube content | Quick one-off YouTube processing |
|
|
156
|
-
| `lai agent` | Intelligent YouTube workflow with retries | Production, batch jobs, automation |
|
|
157
|
-
| `lai subtitle` | Convert subtitle formats | Format conversion only |
|
|
158
|
-
|
|
159
|
-
#### lai align options
|
|
160
|
-
```
|
|
161
|
-
> lai align --help
|
|
162
|
-
Usage: lattifai align [OPTIONS] INPUT_AUDIO_PATH INPUT_SUBTITLE_PATH OUTPUT_SUBTITLE_PATH
|
|
163
|
-
|
|
164
|
-
Command used to align audio with subtitles
|
|
165
|
-
|
|
166
|
-
Options:
|
|
167
|
-
-F, --input_format [srt|vtt|ass|ssa|sub|sbv|txt|auto|gemini] Input subtitle format.
|
|
168
|
-
-S, --split_sentence Re-segment subtitles by semantics.
|
|
169
|
-
-W, --word_level Include word-level alignment timestamps.
|
|
170
|
-
-D, --device [cpu|cuda|mps] Device to use for inference.
|
|
171
|
-
-M, --model_name_or_path TEXT Model name or path for alignment.
|
|
172
|
-
--api_key TEXT API key for LattifAI.
|
|
173
|
-
--help Show this message and exit.
|
|
174
|
-
```
|
|
175
|
-
|
|
176
|
-
#### lai youtube command
|
|
177
|
-
|
|
178
|
-
Download and align YouTube videos in one step. Automatically downloads media, fetches subtitles (or uses Gemini transcription if unavailable), and performs forced alignment.
|
|
179
|
-
|
|
180
|
-
```bash
|
|
181
|
-
# Basic usage
|
|
182
|
-
lai youtube https://www.youtube.com/watch?v=VIDEO_ID
|
|
183
|
-
|
|
184
|
-
# Common options: audio format, sentence splitting, word-level, GPU
|
|
185
|
-
lai youtube --media-format mp3 --split-sentence --word-level --device mps \
|
|
186
|
-
--output-dir ./output --output-format srt https://www.youtube.com/watch?v=VIDEO_ID
|
|
187
|
-
|
|
188
|
-
# Use Gemini for transcription fallback
|
|
189
|
-
# Gemini API Key: Get yours at https://aistudio.google.com/apikey
|
|
190
|
-
# Note: Your API key is completely safe - it's never logged or stored by our codebase
|
|
191
|
-
lai youtube --gemini-api-key YOUR_GEMINI_KEY https://www.youtube.com/watch?v=VIDEO_ID
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
**Options**:
|
|
195
|
-
```
|
|
196
|
-
> lai youtube --help
|
|
197
|
-
Usage: lattifai youtube [OPTIONS] YT_URL
|
|
198
|
-
|
|
199
|
-
Download media and subtitles from YouTube for further alignment.
|
|
200
|
-
|
|
201
|
-
Options:
|
|
202
|
-
-M, --media-format [mp3|wav|m4a|aac|flac|ogg|opus|aiff|mp4|webm|mkv|avi|mov] Media format for YouTube download.
|
|
203
|
-
-S, --split-sentence Re-segment subtitles by semantics.
|
|
204
|
-
-W, --word-level Include word-level alignment timestamps.
|
|
205
|
-
-O, --output-dir PATH Output directory (default: current directory).
|
|
206
|
-
-D, --device [cpu|cuda|mps] Device to use for inference.
|
|
207
|
-
-M, --model-name-or-path TEXT Model name or path for alignment.
|
|
208
|
-
--api-key TEXT API key for LattifAI.
|
|
209
|
-
--gemini-api-key TEXT Gemini API key for transcription fallback.
|
|
210
|
-
-F, --output-format [srt|vtt|ass|ssa|sub|sbv|txt|json|TextGrid] Subtitle output format.
|
|
211
|
-
--help Show this message and exit.
|
|
212
|
-
```
|
|
213
|
-
|
|
214
|
-
#### lai agent command
|
|
215
|
-
|
|
216
|
-
**Intelligent Agentic Workflow** - Process YouTube videos through an advanced multi-step workflow with automatic retries, smart file management, and comprehensive error handling.
|
|
217
|
-
|
|
218
|
-
```bash
|
|
219
|
-
# Basic usage
|
|
220
|
-
lai agent --youtube https://www.youtube.com/watch?v=VIDEO_ID
|
|
221
|
-
|
|
222
|
-
# Production workflow with retries, verbose logging, and force overwrite
|
|
223
|
-
lai agent --youtube --media-format mp4 --output-format TextGrid \
|
|
224
|
-
--split-sentence --word-level --device mps --max-retries 2 --verbose --force \
|
|
225
|
-
--output-dir ./outputs https://www.youtube.com/watch?v=VIDEO_ID
|
|
226
|
-
```
|
|
227
|
-
|
|
228
|
-
**Key Features**:
|
|
229
|
-
- **🔄 Automatic Retry Logic**: Configurable retry mechanism for failed steps
|
|
230
|
-
- **📁 Smart File Management**: Detects existing files and prompts for action
|
|
231
|
-
- **🎯 Intelligent Workflow**: Multi-step pipeline with dependency management
|
|
232
|
-
- **🛡️ Error Recovery**: Graceful handling of failures with detailed logging
|
|
233
|
-
- **📊 Rich Output**: Comprehensive results with metadata and file paths
|
|
234
|
-
- **⚡ Async Processing**: Efficient parallel execution of independent tasks
|
|
235
|
-
|
|
236
|
-
**Options**:
|
|
237
|
-
```
|
|
238
|
-
> lai agent --help
|
|
239
|
-
Usage: lattifai agent [OPTIONS] URL
|
|
240
|
-
|
|
241
|
-
LattifAI Agentic Workflow Agent
|
|
242
|
-
|
|
243
|
-
Process multimedia content through intelligent agent-based pipelines.
|
|
244
|
-
|
|
245
|
-
Options:
|
|
246
|
-
--youtube, --yt Process YouTube URL through agentic workflow.
|
|
247
|
-
--gemini-api-key TEXT Gemini API key for transcription.
|
|
248
|
-
--media-format [mp3|wav|m4a|aac|opus|mp4|webm|mkv|...] Media format for YouTube download.
|
|
249
|
-
--output-format [srt|vtt|ass|ssa|sub|sbv|txt|json|...] Subtitle output format.
|
|
250
|
-
--output-dir PATH Output directory (default: current directory).
|
|
251
|
-
--max-retries INTEGER Maximum retries for failed steps.
|
|
252
|
-
-S, --split-sentence Re-segment subtitles by semantics.
|
|
253
|
-
--word-level Include word-level alignment timestamps.
|
|
254
|
-
--verbose, -v Enable verbose logging.
|
|
255
|
-
--force, -f Force overwrite without confirmation.
|
|
256
|
-
--help Show this message and exit.
|
|
257
|
-
```
|
|
258
|
-
|
|
259
|
-
**When to use `lai agent` vs `lai youtube`**:
|
|
260
|
-
- Both `lai agent --youtube URL` and `lai youtube URL` provide the same core functionality for downloading and aligning YouTube content
|
|
261
|
-
- **Use `lai agent --youtube`**: For production workflows, batch processing, advanced error handling, and when you need retry logic
|
|
262
|
-
- **Use `lai youtube`**: For quick one-off downloads and alignment with minimal overhead
|
|
263
|
-
|
|
264
|
-
#### Understanding --split_sentence
|
|
265
|
-
|
|
266
|
-
The `--split_sentence` option performs intelligent sentence re-splitting based on punctuation and semantic boundaries. This is especially useful when processing subtitles that combine multiple semantic units in a single segment, such as:
|
|
267
|
-
|
|
268
|
-
- **Mixed content**: Non-speech elements (e.g., `[APPLAUSE]`, `[MUSIC]`) followed by actual dialogue
|
|
269
|
-
- **Natural punctuation boundaries**: Colons, periods, and other punctuation marks that indicate semantic breaks
|
|
270
|
-
- **Concatenated phrases**: Multiple distinct utterances joined together without proper separation
|
|
271
|
-
|
|
272
|
-
**Example transformations**:
|
|
273
|
-
```
|
|
274
|
-
Input: "[APPLAUSE] >> MIRA MURATI: Thank you all"
|
|
275
|
-
Output: ["[APPLAUSE]", ">> MIRA MURATI: Thank you all"]
|
|
276
|
-
|
|
277
|
-
Input: "[MUSIC] Welcome back. Today we discuss AI."
|
|
278
|
-
Output: ["[MUSIC]", "Welcome back.", "Today we discuss AI."]
|
|
279
|
-
```
|
|
280
|
-
|
|
281
|
-
This feature helps improve alignment accuracy by:
|
|
282
|
-
1. Respecting punctuation-based semantic boundaries
|
|
283
|
-
2. Separating distinct utterances for more precise timing
|
|
284
|
-
3. Maintaining semantic context for each independent phrase
|
|
285
|
-
|
|
286
|
-
**Usage**:
|
|
287
|
-
```bash
|
|
288
|
-
lai align --split_sentence audio.wav subtitle.srt output.srt
|
|
289
|
-
```
|
|
290
|
-
|
|
291
|
-
#### Understanding --word_level
|
|
292
|
-
|
|
293
|
-
The `--word_level` option enables word-level alignment, providing precise timing information for each individual word in the audio. When enabled, the output includes detailed word boundaries within each subtitle segment, allowing for fine-grained synchronization and analysis.
|
|
294
|
-
|
|
295
|
-
**Key features**:
|
|
296
|
-
- **Individual word timestamps**: Each word gets its own start and end time
|
|
297
|
-
- **Format-specific output**:
|
|
298
|
-
- **JSON (Recommended)**: Full alignment details stored in `alignment.word` field of each segment, preserving all word-level timing information in a structured format
|
|
299
|
-
- **TextGrid**: Separate "words" tier alongside the "utterances" tier for linguistic analysis
|
|
300
|
-
- **TXT**: Each word on a separate line with timestamp range: `[start-end] word`
|
|
301
|
-
- **Standard subtitle formats** (SRT, VTT, ASS, etc.): Each word becomes a separate subtitle event
|
|
302
|
-
|
|
303
|
-
> **💡 Recommended**: Use JSON format (`output.json`) to preserve complete word-level alignment data. Other formats may lose some structural information.
|
|
304
|
-
|
|
305
|
-
**Example output formats**:
|
|
306
|
-
|
|
307
|
-
**JSON format** (with word-level details):
|
|
308
|
-
```json
|
|
309
|
-
[
|
|
310
|
-
{
|
|
311
|
-
"id": "6",
|
|
312
|
-
"recording_id": "",
|
|
313
|
-
"start": 24.52,
|
|
314
|
-
"duration": 9.1,
|
|
315
|
-
"channel": 0,
|
|
316
|
-
"text": "We will start with why it is so important to us to have a product that we can make truly available and broadly available to everyone.",
|
|
317
|
-
"custom": {
|
|
318
|
-
"score": 0.8754
|
|
319
|
-
},
|
|
320
|
-
"alignment": {
|
|
321
|
-
"word": [
|
|
322
|
-
[
|
|
323
|
-
"We",
|
|
324
|
-
24.6,
|
|
325
|
-
0.14,
|
|
326
|
-
1.0
|
|
327
|
-
],
|
|
328
|
-
[
|
|
329
|
-
"will",
|
|
330
|
-
24.74,
|
|
331
|
-
0.14,
|
|
332
|
-
1.0
|
|
333
|
-
],
|
|
334
|
-
[
|
|
335
|
-
"start",
|
|
336
|
-
24.88,
|
|
337
|
-
0.46,
|
|
338
|
-
0.771
|
|
339
|
-
],
|
|
340
|
-
[
|
|
341
|
-
"with",
|
|
342
|
-
25.34,
|
|
343
|
-
0.28,
|
|
344
|
-
0.9538
|
|
345
|
-
],
|
|
346
|
-
[
|
|
347
|
-
"why",
|
|
348
|
-
26.2,
|
|
349
|
-
0.36,
|
|
350
|
-
1.0
|
|
351
|
-
],
|
|
352
|
-
[
|
|
353
|
-
"it",
|
|
354
|
-
26.56,
|
|
355
|
-
0.14,
|
|
356
|
-
0.9726
|
|
357
|
-
],
|
|
358
|
-
[
|
|
359
|
-
"is",
|
|
360
|
-
26.74,
|
|
361
|
-
0.02,
|
|
362
|
-
0.6245
|
|
363
|
-
],
|
|
364
|
-
[
|
|
365
|
-
"so",
|
|
366
|
-
26.76,
|
|
367
|
-
0.16,
|
|
368
|
-
0.6615
|
|
369
|
-
],
|
|
370
|
-
[
|
|
371
|
-
"important",
|
|
372
|
-
26.92,
|
|
373
|
-
0.54,
|
|
374
|
-
0.9257
|
|
375
|
-
],
|
|
376
|
-
[
|
|
377
|
-
"to",
|
|
378
|
-
27.5,
|
|
379
|
-
0.1,
|
|
380
|
-
1.0
|
|
381
|
-
],
|
|
382
|
-
[
|
|
383
|
-
"us",
|
|
384
|
-
27.6,
|
|
385
|
-
0.34,
|
|
386
|
-
0.7955
|
|
387
|
-
],
|
|
388
|
-
[
|
|
389
|
-
"to",
|
|
390
|
-
28.04,
|
|
391
|
-
0.08,
|
|
392
|
-
0.8545
|
|
393
|
-
],
|
|
394
|
-
[
|
|
395
|
-
"have",
|
|
396
|
-
28.16,
|
|
397
|
-
0.46,
|
|
398
|
-
0.9994
|
|
399
|
-
],
|
|
400
|
-
[
|
|
401
|
-
"a",
|
|
402
|
-
28.76,
|
|
403
|
-
0.06,
|
|
404
|
-
1.0
|
|
405
|
-
],
|
|
406
|
-
[
|
|
407
|
-
"product",
|
|
408
|
-
28.82,
|
|
409
|
-
0.56,
|
|
410
|
-
0.9975
|
|
411
|
-
],
|
|
412
|
-
[
|
|
413
|
-
"that",
|
|
414
|
-
29.38,
|
|
415
|
-
0.08,
|
|
416
|
-
0.5602
|
|
417
|
-
],
|
|
418
|
-
[
|
|
419
|
-
"we",
|
|
420
|
-
29.46,
|
|
421
|
-
0.16,
|
|
422
|
-
0.7017
|
|
423
|
-
],
|
|
424
|
-
[
|
|
425
|
-
"can",
|
|
426
|
-
29.62,
|
|
427
|
-
0.22,
|
|
428
|
-
1.0
|
|
429
|
-
],
|
|
430
|
-
[
|
|
431
|
-
"make",
|
|
432
|
-
29.84,
|
|
433
|
-
0.32,
|
|
434
|
-
0.9643
|
|
435
|
-
],
|
|
436
|
-
[
|
|
437
|
-
"truly",
|
|
438
|
-
30.42,
|
|
439
|
-
0.32,
|
|
440
|
-
0.6737
|
|
441
|
-
],
|
|
442
|
-
[
|
|
443
|
-
"available",
|
|
444
|
-
30.74,
|
|
445
|
-
0.6,
|
|
446
|
-
0.9349
|
|
447
|
-
],
|
|
448
|
-
[
|
|
449
|
-
"and",
|
|
450
|
-
31.4,
|
|
451
|
-
0.2,
|
|
452
|
-
0.4114
|
|
453
|
-
],
|
|
454
|
-
[
|
|
455
|
-
"broadly",
|
|
456
|
-
31.6,
|
|
457
|
-
0.44,
|
|
458
|
-
0.6726
|
|
459
|
-
],
|
|
460
|
-
[
|
|
461
|
-
"available",
|
|
462
|
-
32.04,
|
|
463
|
-
0.58,
|
|
464
|
-
0.9108
|
|
465
|
-
],
|
|
466
|
-
[
|
|
467
|
-
"to",
|
|
468
|
-
32.72,
|
|
469
|
-
0.06,
|
|
470
|
-
1.0
|
|
471
|
-
],
|
|
472
|
-
[
|
|
473
|
-
"everyone.",
|
|
474
|
-
32.78,
|
|
475
|
-
0.64,
|
|
476
|
-
0.7886
|
|
477
|
-
]
|
|
478
|
-
]
|
|
479
|
-
}
|
|
480
|
-
}
|
|
481
|
-
]
|
|
482
|
-
```
|
|
483
|
-
|
|
484
|
-
**TXT format** (word-level):
|
|
485
|
-
```
|
|
486
|
-
[0.50-1.20] Hello
|
|
487
|
-
[1.20-2.30] world
|
|
488
|
-
```
|
|
489
|
-
|
|
490
|
-
**TextGrid format** (Praat-compatible):
|
|
491
|
-
```
|
|
492
|
-
Two tiers created:
|
|
493
|
-
- "utterances" tier: Full segments with original text
|
|
494
|
-
- "words" tier: Individual words with precise boundaries
|
|
495
|
-
```
|
|
496
|
-
|
|
497
|
-
**Use cases**:
|
|
498
|
-
- **Linguistic analysis**: Study pronunciation patterns, speech timing, and prosody
|
|
499
|
-
- **Accessibility**: Create more granular captions for hearing-impaired users
|
|
500
|
-
- **Video/Audio editing**: Enable precise word-level subtitle synchronization
|
|
501
|
-
- **Karaoke applications**: Highlight individual words as they are spoken
|
|
502
|
-
- **Language learning**: Provide precise word boundaries for pronunciation practice
|
|
503
|
-
|
|
504
|
-
**Usage**:
|
|
505
|
-
```bash
|
|
506
|
-
# Generate word-level aligned JSON
|
|
507
|
-
lai align --word_level audio.wav subtitle.srt output.json
|
|
508
|
-
|
|
509
|
-
# Create TextGrid file for Praat analysis
|
|
510
|
-
lai align --word_level audio.wav subtitle.srt output.TextGrid
|
|
511
|
-
|
|
512
|
-
# Word-level TXT output
|
|
513
|
-
lai align --word_level audio.wav subtitle.srt output.txt
|
|
514
|
-
|
|
515
|
-
# Standard subtitle with word-level events
|
|
516
|
-
lai align --word_level audio.wav subtitle.srt output.srt
|
|
517
|
-
```
|
|
518
|
-
|
|
519
|
-
**Combined with --split_sentence**:
|
|
520
|
-
```bash
|
|
521
|
-
# Optimal alignment: semantic splitting + word-level details
|
|
522
|
-
lai align --split_sentence --word_level audio.wav subtitle.srt output.json
|
|
523
|
-
```
|
|
524
|
-
|
|
525
|
-
### Python API
|
|
526
|
-
|
|
527
|
-
```python
|
|
528
|
-
from lattifai import LattifAI
|
|
529
|
-
|
|
530
|
-
client = LattifAI() # api_key will be read from LATTIFAI_API_KEY if not provided
|
|
531
|
-
alignments, output_path = client.alignment(
|
|
532
|
-
audio="audio.wav",
|
|
533
|
-
subtitle="subtitle.srt",
|
|
534
|
-
output_subtitle_path="output.srt",
|
|
535
|
-
)
|
|
536
|
-
```
|
|
537
|
-
|
|
538
|
-
Need to run inside an async application? Use the drop-in asynchronous client:
|
|
539
|
-
|
|
540
|
-
```python
|
|
541
|
-
import asyncio
|
|
542
|
-
from lattifai import AsyncLattifAI
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
async def main():
|
|
546
|
-
async with AsyncLattifAI() as client:
|
|
547
|
-
alignments, output_path = await client.alignment(
|
|
548
|
-
audio="audio.wav",
|
|
549
|
-
subtitle="subtitle.srt",
|
|
550
|
-
split_sentence=False,
|
|
551
|
-
output_subtitle_path="output.srt",
|
|
552
|
-
)
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
asyncio.run(main())
|
|
556
|
-
```
|
|
557
|
-
|
|
558
|
-
Both clients return a tuple: a list of `Supervision` segments with timing information and, if `output_subtitle_path` was provided, the path where the aligned subtitle was written.
|
|
559
|
-
|
|
560
|
-
## Supported Formats
|
|
561
|
-
|
|
562
|
-
**Audio**: WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF
|
|
563
|
-
|
|
564
|
-
**Video**: MP4, MKV, MOV, WEBM, AVI
|
|
565
|
-
|
|
566
|
-
**Subtitle Input**: SRT, VTT, ASS, SSA, SUB, SBV, TXT (plain text), Gemini (Google Gemini transcript format)
|
|
567
|
-
|
|
568
|
-
**Subtitle Output**: All input formats plus TextGrid (Praat format for linguistic analysis)
|
|
569
|
-
|
|
570
|
-
## API Reference
|
|
571
|
-
|
|
572
|
-
### LattifAI (sync)
|
|
573
|
-
|
|
574
|
-
```python
|
|
575
|
-
LattifAI(
|
|
576
|
-
api_key: Optional[str] = None,
|
|
577
|
-
model_name_or_path: str = 'Lattifai/Lattice-1-Alpha',
|
|
578
|
-
device: str = 'cpu', # 'cpu', 'cuda', or 'mps'
|
|
579
|
-
)
|
|
580
|
-
```
|
|
581
|
-
|
|
582
|
-
### AsyncLattifAI (async)
|
|
583
|
-
|
|
584
|
-
```python
|
|
585
|
-
AsyncLattifAI(
|
|
586
|
-
api_key: Optional[str] = None,
|
|
587
|
-
model_name_or_path: str = 'Lattifai/Lattice-1-Alpha',
|
|
588
|
-
device: str = 'cpu',
|
|
589
|
-
)
|
|
590
|
-
```
|
|
591
|
-
|
|
592
|
-
Use `async with AsyncLattifAI() as client:` or call `await client.close()` when you are done to release the underlying HTTP session.
|
|
593
|
-
|
|
594
|
-
### alignment()
|
|
595
|
-
|
|
596
|
-
```python
|
|
597
|
-
client.alignment(
|
|
598
|
-
audio: str, # Path to audio file
|
|
599
|
-
subtitle: str, # Path to subtitle/text file
|
|
600
|
-
format: Optional[str] = None, # Input format: 'srt', 'vtt', 'ass', 'txt', 'gemini', or 'auto' (auto-detect if None)
|
|
601
|
-
split_sentence: bool = False, # Smart sentence splitting based on punctuation semantics
|
|
602
|
-
return_details: bool = False, # Enable word-level alignment details
|
|
603
|
-
output_subtitle_path: Optional[str] = None
|
|
604
|
-
) -> Tuple[List[Supervision], Optional[str]] # await client.alignment(...) for AsyncLattifAI
|
|
605
|
-
```
|
|
606
|
-
|
|
607
|
-
**Parameters**:
|
|
608
|
-
- `audio`: Path to the audio file to be aligned
|
|
609
|
-
- `subtitle`: Path to the subtitle or text file
|
|
610
|
-
- `format`: Input subtitle format. Supported values: 'srt', 'vtt', 'ass', 'txt', 'gemini', 'auto'. When set to None or 'auto', the format is automatically detected from file extension. Additional formats (ssa, sub, sbv) are supported through automatic format detection.
|
|
611
|
-
- `split_sentence`: Enable intelligent sentence re-splitting (default: False). Set to True when subtitles combine multiple semantic units (non-speech elements + dialogue, or multiple sentences) that would benefit from separate timing alignment
|
|
612
|
-
- `return_details`: Enable word-level alignment details (default: False). When True, each `Supervision` object includes an `alignment` field with word-level timestamps, accessible via `supervision.alignment['word']`. This provides precise timing for each individual word within the segment
|
|
613
|
-
- `output_subtitle_path`: Output path for aligned subtitle (optional)
|
|
614
|
-
|
|
615
|
-
**Returns**:
|
|
616
|
-
- A tuple containing:
|
|
617
|
-
- `alignments`: List of aligned `Supervision` objects with timing information
|
|
618
|
-
- `output_subtitle_path`: Path where the subtitle was written (if `output_subtitle_path` was provided)
|
|
619
|
-
|
|
620
|
-
## Examples
|
|
621
|
-
|
|
622
|
-
### Basic Text Alignment
|
|
623
|
-
|
|
624
|
-
```python
|
|
625
|
-
from lattifai import LattifAI
|
|
626
|
-
|
|
627
|
-
client = LattifAI()
|
|
628
|
-
alignments, output_path = client.alignment(
|
|
629
|
-
audio="speech.wav",
|
|
630
|
-
subtitle="transcript.txt",
|
|
631
|
-
format="txt",
|
|
632
|
-
split_sentence=False,
|
|
633
|
-
output_subtitle_path="output.srt"
|
|
634
|
-
)
|
|
635
|
-
```
|
|
636
|
-
|
|
637
|
-
### Word-Level Alignment
|
|
638
|
-
|
|
639
|
-
```python
|
|
640
|
-
from lattifai import LattifAI
|
|
641
|
-
|
|
642
|
-
client = LattifAI()
|
|
643
|
-
alignments, output_path = client.alignment(
|
|
644
|
-
audio="speech.wav",
|
|
645
|
-
subtitle="transcript.srt",
|
|
646
|
-
return_details=True, # Enable word-level alignment
|
|
647
|
-
output_subtitle_path="output.json" # JSON format preserves word-level data
|
|
648
|
-
)
|
|
649
|
-
|
|
650
|
-
# Access word-level timestamps
|
|
651
|
-
for segment in alignments:
|
|
652
|
-
print(f"Segment: {segment.text} ({segment.start:.2f}s - {segment.end:.2f}s)")
|
|
653
|
-
if segment.alignment and 'word' in segment.alignment:
|
|
654
|
-
for word in segment.alignment['word']:
|
|
655
|
-
print(f" Word: {word.symbol} ({word.start:.2f}s - {word.end:.2f}s)")
|
|
656
|
-
```
|
|
657
|
-
|
|
658
|
-
### Batch Processing
|
|
659
|
-
|
|
660
|
-
```python
|
|
661
|
-
from pathlib import Path
|
|
662
|
-
from lattifai import LattifAI
|
|
663
|
-
|
|
664
|
-
client = LattifAI()
|
|
665
|
-
audio_dir = Path("audio_files")
|
|
666
|
-
subtitle_dir = Path("subtitles")
|
|
667
|
-
output_dir = Path("aligned")
|
|
668
|
-
|
|
669
|
-
for audio in audio_dir.glob("*.wav"):
|
|
670
|
-
subtitle = subtitle_dir / f"{audio.stem}.srt"
|
|
671
|
-
if subtitle.exists():
|
|
672
|
-
alignments, output_path = client.alignment(
|
|
673
|
-
audio=audio,
|
|
674
|
-
subtitle=subtitle,
|
|
675
|
-
output_subtitle_path=output_dir / f"{audio.stem}_aligned.srt"
|
|
676
|
-
)
|
|
677
|
-
```
|
|
678
|
-
|
|
679
|
-
### GPU Acceleration
|
|
680
|
-
|
|
681
|
-
```python
|
|
682
|
-
from lattifai import LattifAI
|
|
683
|
-
|
|
684
|
-
# NVIDIA GPU
|
|
685
|
-
client = LattifAI(device='cuda')
|
|
686
|
-
|
|
687
|
-
# Apple Silicon
|
|
688
|
-
client = LattifAI(device='mps')
|
|
689
|
-
|
|
690
|
-
# CLI equivalent (the next line is a shell command, not Python)
|
|
691
|
-
lai align --device mps audio.wav subtitle.srt output.srt
|
|
692
|
-
```
|
|
693
|
-
|
|
694
|
-
### YouTube Processing with Agent Workflow
|
|
695
|
-
|
|
696
|
-
```python
|
|
697
|
-
import asyncio
|
|
698
|
-
from lattifai.workflows import YouTubeSubtitleAgent
|
|
699
|
-
|
|
700
|
-
async def process_youtube():
|
|
701
|
-
# Initialize agent with configuration
|
|
702
|
-
agent = YouTubeSubtitleAgent(
|
|
703
|
-
gemini_api_key="your-gemini-api-key",
|
|
704
|
-
video_format="mp4", # or "mp3", "wav", etc.
|
|
705
|
-
output_format="srt",
|
|
706
|
-
max_retries=2,
|
|
707
|
-
split_sentence=True,
|
|
708
|
-
word_level=True,
|
|
709
|
-
force_overwrite=False
|
|
710
|
-
)
|
|
711
|
-
|
|
712
|
-
# Process YouTube URL
|
|
713
|
-
result = await agent.process_youtube_url(
|
|
714
|
-
url="https://www.youtube.com/watch?v=VIDEO_ID",
|
|
715
|
-
output_dir="./output",
|
|
716
|
-
output_format="srt"
|
|
717
|
-
)
|
|
718
|
-
|
|
719
|
-
# Access results
|
|
720
|
-
print(f"Title: {result['metadata']['title']}")
|
|
721
|
-
print(f"Duration: {result['metadata']['duration']} seconds")
|
|
722
|
-
print(f"Subtitle count: {result['subtitle_count']}")
|
|
723
|
-
|
|
724
|
-
# Access generated files
|
|
725
|
-
for format_name, file_path in result['exported_files'].items():
|
|
726
|
-
print(f"{format_name.upper()}: {file_path}")
|
|
727
|
-
|
|
728
|
-
# Run the async workflow
|
|
729
|
-
asyncio.run(process_youtube())
|
|
730
|
-
```
|
|
731
|
-
|
|
732
|
-
## Configuration
|
|
733
|
-
|
|
734
|
-
### API Key Setup
|
|
735
|
-
|
|
736
|
-
First, create your API key at [https://lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
|
|
737
|
-
|
|
738
|
-
**Recommended: Using .env file**
|
|
739
|
-
|
|
740
|
-
Create a `.env` file in your project root:
|
|
741
|
-
```bash
|
|
742
|
-
LATTIFAI_API_KEY=your-api-key
|
|
743
|
-
```
|
|
744
|
-
|
|
745
|
-
The library automatically loads the `.env` file (python-dotenv is included as a dependency).
|
|
746
|
-
|
|
747
|
-
**Alternative: Environment variable**
|
|
748
|
-
```bash
|
|
749
|
-
export LATTIFAI_API_KEY="your-api-key"
|
|
750
|
-
```
|
|
751
|
-
|
|
752
|
-
## Model Information
|
|
753
|
-
|
|
754
|
-
**[Lattice-1-Alpha](https://huggingface.co/Lattifai/Lattice-1-Alpha)** features:
|
|
755
|
-
- State-of-the-art alignment precision
|
|
756
|
-
- **Language Support**: Currently supports English only. The upcoming **Lattice-1** release will support English, Chinese, and mixed English-Chinese content.
|
|
757
|
-
- Handles noisy audio and imperfect transcripts
|
|
758
|
-
- Optimized for CPU and GPU (CUDA/MPS)
|
|
759
|
-
|
|
760
|
-
**Requirements**:
|
|
761
|
-
- Python 3.10 - 3.13 (3.14 support coming soon)
|
|
762
|
-
- 4GB RAM recommended
|
|
763
|
-
- ~2GB storage for model files
|
|
764
|
-
|
|
765
|
-
## Development
|
|
766
|
-
|
|
767
|
-
### Setup
|
|
768
|
-
|
|
769
|
-
```bash
|
|
770
|
-
git clone https://github.com/lattifai/lattifai-python.git
|
|
771
|
-
cd lattifai-python
|
|
772
|
-
pip install -e ".[test]"
|
|
773
|
-
./scripts/install-hooks.sh # Optional: install pre-commit hooks
|
|
774
|
-
```
|
|
775
|
-
|
|
776
|
-
### Testing
|
|
777
|
-
|
|
778
|
-
```bash
|
|
779
|
-
pytest # Run all tests
|
|
780
|
-
pytest --cov=src # With coverage
|
|
781
|
-
pytest tests/test_basic.py # Specific test
|
|
782
|
-
```
|
|
783
|
-
|
|
784
|
-
### Code Quality
|
|
785
|
-
|
|
786
|
-
```bash
|
|
787
|
-
ruff check src/ tests/ # Lint
|
|
788
|
-
ruff format src/ tests/ # Format
|
|
789
|
-
isort src/ tests/ # Sort imports
|
|
790
|
-
```
|
|
791
|
-
|
|
792
|
-
## Contributing
|
|
793
|
-
|
|
794
|
-
1. Fork the repository
|
|
795
|
-
2. Create a feature branch
|
|
796
|
-
3. Make changes and add tests
|
|
797
|
-
4. Run `pytest` and `ruff check`
|
|
798
|
-
5. Submit a pull request
|
|
799
|
-
|
|
800
|
-
## License
|
|
801
|
-
|
|
802
|
-
Apache License 2.0
|
|
803
|
-
|
|
804
|
-
## Support
|
|
805
|
-
|
|
806
|
-
- **Issues**: [GitHub Issues](https://github.com/lattifai/lattifai-python/issues)
|
|
807
|
-
- **Discussions**: [GitHub Discussions](https://github.com/lattifai/lattifai-python/discussions)
|
|
808
|
-
- **Discord**: [Join our community](https://discord.gg/kvF4WsBRK8)
|