lattifai 1.0.5-py3-none-any.whl → 1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +11 -12
- lattifai/alignment/lattice1_aligner.py +39 -7
- lattifai/alignment/lattice1_worker.py +135 -147
- lattifai/alignment/tokenizer.py +38 -22
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +55 -19
- lattifai/cli/__init__.py +2 -0
- lattifai/cli/caption.py +1 -1
- lattifai/cli/diarization.py +110 -0
- lattifai/cli/transcribe.py +3 -1
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +32 -111
- lattifai/config/alignment.py +14 -0
- lattifai/config/client.py +5 -0
- lattifai/config/transcription.py +4 -0
- lattifai/diarization/lattifai.py +18 -7
- lattifai/mixin.py +26 -5
- lattifai/transcription/__init__.py +1 -1
- lattifai/transcription/base.py +21 -2
- lattifai/transcription/gemini.py +127 -1
- lattifai/transcription/lattifai.py +30 -2
- lattifai/utils.py +62 -69
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/METADATA +352 -56
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/RECORD +29 -28
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/entry_points.txt +2 -0
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/WHEEL +0 -0
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/top_level.txt +0 -0
{lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lattifai
-Version: 1.0.5
+Version: 1.2.0
 Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
 Author-email: Lattifai Technologies <tech@lattifai.com>
 Maintainer-email: Lattice <tech@lattifai.com>
@@ -50,7 +50,8 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: <3.15,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
+Requires-Dist: k2py>=0.2.1
+Requires-Dist: lattifai-core>=0.6.0
 Requires-Dist: lattifai-run>=1.0.1
 Requires-Dist: python-dotenv
 Requires-Dist: lhotse>=1.26.0
@@ -61,11 +62,13 @@ Requires-Dist: tgt
 Requires-Dist: onnx>=1.16.0
 Requires-Dist: onnxruntime
 Requires-Dist: msgpack
+Requires-Dist: scipy!=1.16.3
 Requires-Dist: g2p-phonemizer>=0.4.0
 Requires-Dist: av
 Requires-Dist: wtpsplit>=2.1.7
+Requires-Dist: modelscope==1.33.0
 Requires-Dist: OmniSenseVoice>=0.4.2
-Requires-Dist: nemo_toolkit_asr[asr]>=2.7.
+Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4
 Requires-Dist: pyannote-audio-notorchdeps>=4.0.2
 Requires-Dist: questionary>=2.0
 Requires-Dist: yt-dlp
@@ -113,11 +116,9 @@ Dynamic: license-file
 
 Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/Lattifai/Lattice-1) model.
 
-> **⚠️ Note on Current Limitations**:
-> 1. **Memory Usage**: We are aware of high memory consumption and are actively working on further optimizations.
-
 ## Table of Contents
 
+- [Core Capabilities](#core-capabilities)
 - [Installation](#installation)
 - [Quick Start](#quick-start)
 - [Command Line Interface](#command-line-interface)
@@ -134,16 +135,45 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
 - [YouTube Processing](#youtube-processing)
 - [Configuration Objects](#configuration-objects)
 - [Advanced Features](#advanced-features)
+- [Audio Preprocessing](#audio-preprocessing)
+- [Long-Form Audio Support](#long-form-audio-support)
 - [Word-Level Alignment](#word-level-alignment)
 - [Smart Sentence Splitting](#smart-sentence-splitting)
-- [Speaker Diarization](#speaker-diarization
+- [Speaker Diarization](#speaker-diarization)
 - [YAML Configuration Files](#yaml-configuration-files)
+- [Architecture Overview](#architecture-overview)
+- [Performance & Optimization](#performance--optimization)
 - [Supported Formats](#supported-formats)
+- [Supported Languages](#supported-languages)
 - [Roadmap](#roadmap)
 - [Development](#development)
 
 ---
 
+## Core Capabilities
+
+LattifAI provides comprehensive audio-text alignment powered by the Lattice-1 model:
+
+| Feature | Description | Status |
+|---------|-------------|--------|
+| **Forced Alignment** | Precise word-level and segment-level synchronization with audio | ✅ Production |
+| **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) | ✅ Production |
+| **Speaker Diarization** | Automatic multi-speaker identification with label preservation | ✅ Production |
+| **Audio Preprocessing** | Multi-channel selection, device optimization (CPU/CUDA/MPS) | ✅ Production |
+| **Streaming Mode** | Process audio up to 20 hours with minimal memory footprint | ✅ Production |
+| **Smart Text Processing** | Intelligent sentence splitting and non-speech element separation | ✅ Production |
+| **Universal Format Support** | 30+ caption/subtitle formats with text normalization | ✅ Production |
+| **Configuration System** | YAML-based configs for reproducible workflows | ✅ Production |
+
+**Key Highlights:**
+- 🎯 **Accuracy**: State-of-the-art alignment precision with Lattice-1 model
+- 🌍 **Multilingual**: Support for 100+ languages via multiple transcription models
+- 🚀 **Performance**: Hardware-accelerated processing with streaming support
+- 🔧 **Flexible**: CLI, Python SDK, and Web UI interfaces
+- 📦 **Production-Ready**: Battle-tested on diverse audio/video content
+
+---
+
 ## Installation
 
 ### Step 1: Install SDK
@@ -151,9 +181,6 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
 **Using pip:**
 ```bash
 
-pip install install-k2
-install-k2 --torch-version 2.9.1  # if not set will auto-detect PyTorch version and install compatible k2
-
 pip install lattifai
 ```
 
@@ -167,30 +194,11 @@ uv init my-project
 cd my-project
 source .venv/bin/activate
 
-# Install k2 (required dependency)
-uv pip install install-k2
-uv pip install pip
-uv run install-k2 --torch-version 2.9.1
-
 # Install LattifAI
 uv pip install lattifai
 ```
 
-> **Note**: `install-k2` automatically detects your PyTorch version (up to 2.9) and installs the compatible k2 wheel.
 
-<details>
-<summary><b>install-k2 options</b></summary>
-
-```
-usage: install-k2 [-h] [--system {linux,darwin,windows}] [--dry-run] [--torch-version TORCH_VERSION]
-
-optional arguments:
-  -h, help                         Show this help message and exit
-  --system {linux,darwin,windows}  Override OS detection
-  --dry-run                        Show what would be installed without making changes
-  --torch-version TORCH_VERSION    Specify torch version (e.g., 2.8.0)
-```
-</details>
 
 ### Step 2: Get Your API Key
 
@@ -254,7 +262,7 @@ caption = client.alignment(
 
 That's it! Your aligned subtitles are saved to `aligned.srt`.
 
-### Web Interface
+### 🚧 Web Interface
 
 
 
@@ -312,13 +320,9 @@ That's it! Your aligned subtitles are saved to `aligned.srt`.
 The web interface will automatically open in your browser at `http://localhost:5173`.
 
 **Features:**
-- ✅
-- ✅
-- ✅
-- ✅ Multiple subtitle format support
-- ✅ Built-in transcription with multiple models
-- ✅ API key management interface
-- ✅ Download aligned subtitles in various formats
+- ✅ **Drag-and-Drop Upload**: Visual file upload for audio/video and captions
+- ✅ **Real-Time Progress**: Live alignment progress with detailed status
+- ✅ **Multiple Transcription Models**: Gemini, Parakeet, SenseVoice selection
 
 ---
 
@@ -619,6 +623,78 @@ from lattifai import (
 
 ## Advanced Features
 
+### Audio Preprocessing
+
+LattifAI provides powerful audio preprocessing capabilities for optimal alignment:
+
+**Channel Selection**
+
+Control which audio channel to process for stereo/multi-channel files:
+
+```python
+from lattifai import LattifAI
+
+client = LattifAI()
+
+# Use left channel only
+caption = client.alignment(
+    input_media="stereo.wav",
+    input_caption="subtitle.srt",
+    channel_selector="left",  # Options: "left", "right", "average", or channel index (0, 1, 2, ...)
+)
+
+# Average all channels (default)
+caption = client.alignment(
+    input_media="stereo.wav",
+    input_caption="subtitle.srt",
+    channel_selector="average",
+)
+```
+
+**CLI Usage:**
+```bash
+# Use right channel
+lai alignment align audio.wav subtitle.srt output.srt \
+    media.channel_selector=right
+
+# Use specific channel index
+lai alignment align audio.wav subtitle.srt output.srt \
+    media.channel_selector=1
+```
+
+**Device Management**
+
+Optimize processing for your hardware:
+
+```python
+from lattifai import LattifAI, AlignmentConfig
+
+# Use CUDA GPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cuda")
+)
+
+# Use specific GPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cuda:0")
+)
+
+# Use Apple Silicon MPS
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="mps")
+)
+
+# Use CPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cpu")
+)
+```
+
+**Supported Formats**
+- **Audio**: WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more
+- **Video**: MP4, MKV, MOV, WEBM, AVI, and more
+- All formats supported by FFmpeg are compatible
+
 ### Long-Form Audio Support
 
 LattifAI now supports processing long audio files (up to 20 hours) through streaming mode. Enable streaming by setting the `streaming_chunk_secs` parameter:
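The streaming example itself is unchanged context and therefore not shown in this hunk. Going by the call shown later in this diff's Performance & Optimization hunk, enabling streaming comes down to a single keyword argument on `client.alignment`; the file paths below are placeholders.

```python
from lattifai import LattifAI

client = LattifAI()

# Streaming mode: audio is processed in fixed-size chunks instead of being loaded whole.
# 600-second chunks match the value used in the Performance & Optimization section;
# smaller chunks lower peak RAM at a small cost in throughput.
caption = client.alignment(
    input_media="long_audio.wav",   # placeholder path
    input_caption="subtitle.srt",   # placeholder path
    streaming_chunk_secs=600.0,     # 10-minute chunks
)
```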
@@ -660,14 +736,18 @@ client = LattifAI(
 )
 ```
 
-**
-
-
-
-
-
-
-
+**Technical Details:**
+
+| Parameter | Description | Recommendation |
+|-----------|-------------|----------------|
+| **Default Value** | 600 seconds (10 minutes) | Good for most use cases |
+| **Memory Impact** | Lower chunks = less RAM usage | Adjust based on available RAM |
+| **Accuracy Impact** | Virtually zero degradation | Our precise implementation preserves quality |
+
+**Performance Characteristics:**
+- ✅ **Near-Perfect Accuracy**: Streaming implementation maintains alignment precision
+- 🚧 **Memory Efficient**: Process 20-hour audio with <10GB RAM (600-sec chunks)
+
 
 ### Word-Level Alignment
 
@@ -708,18 +788,50 @@ caption = client.alignment(
 )
 ```
 
-### Speaker Diarization
+### Speaker Diarization
+
+Speaker diarization automatically identifies and labels different speakers in audio using state-of-the-art models.
+
+**Core Capabilities:**
+- 🎤 **Multi-Speaker Detection**: Automatically detect speaker changes in audio
+- 🏷️ **Smart Labeling**: Assign speaker labels (SPEAKER_00, SPEAKER_01, etc.)
+- 🔄 **Label Preservation**: Maintain existing speaker names from input captions
+- 🤖 **Gemini Integration**: Extract speaker names intelligently during transcription
+
+**How It Works:**
+
+1. **Without Existing Labels**: System assigns generic labels (SPEAKER_00, SPEAKER_01)
+2. **With Existing Labels**: System preserves your speaker names during alignment
+   - Formats: `[Alice]`, `>> Bob:`, `SPEAKER_01:`, `Alice:` are all recognized
+3. **Gemini Transcription**: When using Gemini models, speaker names are extracted from context
+   - Example: "Hi, I'm Alice" → System labels as `Alice` instead of `SPEAKER_00`
+
+**Speaker Label Integration:**
+
+The diarization engine intelligently matches detected speakers with existing labels:
+- If input captions have speaker names → **Preserved during alignment**
+- If Gemini transcription provides names → **Used for labeling**
+- Otherwise → **Generic labels (SPEAKER_00, etc.) assigned**
+* 🚧 **Future Enhancement:**
+  - **AI-Powered Speaker Name Inference**: Upcoming feature will use large language models combined with metadata (video title, description, context) to intelligently infer speaker names, making transcripts more human-readable and contextually accurate
 
-**
+**CLI:**
+```bash
+# Enable speaker diarization during alignment
+lai alignment align audio.wav subtitle.srt output.srt \
+    diarization.enabled=true
 
-
-
-
-
+# With additional diarization settings
+lai alignment align audio.wav subtitle.srt output.srt \
+    diarization.enabled=true \
+    diarization.device=cuda \
+    diarization.min_speakers=2 \
+    diarization.max_speakers=4
 
-
-
-
+# For YouTube videos with diarization
+lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID" \
+    diarization.enabled=true
+```
 
 **Python SDK:**
 ```python
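The Python SDK block that follows is unchanged context and not shown in this hunk. Below is a minimal sketch of the SDK-side equivalent of the CLI calls above, assuming `DiarizationConfig` exposes fields mirroring the dotted CLI keys (`enabled`, `device`, `min_speakers`, `max_speakers`) and that the client accepts it as `diarization_config`, following the pattern the README uses for the other config objects; those names and the per-segment attributes are assumptions, not API confirmed by this diff.

```python
from lattifai import LattifAI, DiarizationConfig  # import location assumed

# Assumption: constructor fields mirror the dotted keys used on the CLI above.
client = LattifAI(
    diarization_config=DiarizationConfig(
        enabled=True,
        device="cuda",
        min_speakers=2,
        max_speakers=4,
    )
)

caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
)

# The next hunk's context iterates over caption.supervisions; the start/end/speaker/text
# attribute names are assumed here purely for illustration.
for segment in caption.supervisions:
    print(segment.start, segment.end, segment.speaker, segment.text)
```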
@@ -742,6 +854,8 @@ for segment in caption.supervisions:
 
 ### YAML Configuration Files
 
+* **under development**
+
 Create reusable configuration files:
 
 ```yaml
@@ -758,6 +872,125 @@ lai alignment align audio.wav subtitle.srt output.srt \
 
 ---
 
+## Architecture Overview
+
+LattifAI uses a modular, config-driven architecture for maximum flexibility:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                       LattifAI Client                        │
+├─────────────────────────────────────────────────────────────┤
+│  Configuration Layer (Config-Driven)                         │
+│  ├── ClientConfig        (API settings)                      │
+│  ├── AlignmentConfig     (Model & device)                    │
+│  ├── CaptionConfig       (I/O formats)                       │
+│  ├── TranscriptionConfig (ASR models)                        │
+│  └── DiarizationConfig   (Speaker detection)                 │
+├─────────────────────────────────────────────────────────────┤
+│  Core Components                                             │
+│  ├── AudioLoader  → Load & preprocess audio                  │
+│  ├── Aligner      → Lattice-1 forced alignment               │
+│  ├── Transcriber  → Multi-model ASR                          │
+│  ├── Diarizer     → Speaker identification                   │
+│  └── Tokenizer    → Intelligent text segmentation            │
+├─────────────────────────────────────────────────────────────┤
+│  Data Flow                                                   │
+│  Input → AudioLoader → Aligner → Diarizer → Caption          │
+│                           ↓                                  │
+│                  Transcriber (optional)                      │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Component Responsibilities:**
+
+| Component | Purpose | Configuration |
+|-----------|---------|---------------|
+| **AudioLoader** | Load audio/video, channel selection, format conversion | `MediaConfig` |
+| **Aligner** | Forced alignment using Lattice-1 model | `AlignmentConfig` |
+| **Transcriber** | ASR with Gemini/Parakeet/SenseVoice | `TranscriptionConfig` |
+| **Diarizer** | Speaker diarization with pyannote.audio | `DiarizationConfig` |
+| **Tokenizer** | Sentence splitting and text normalization | `CaptionConfig` |
+| **Caption** | Unified data structure for alignments | `CaptionConfig` |
+
+**Data Flow:**
+
+1. **Audio Loading**: `AudioLoader` loads media, applies channel selection, converts to numpy array
+2. **Transcription** (optional): `Transcriber` generates transcript if no caption provided
+3. **Text Preprocessing**: `Tokenizer` splits sentences and normalizes text
+4. **Alignment**: `Aligner` uses Lattice-1 to compute word-level timestamps
+5. **Diarization** (optional): `Diarizer` identifies speakers and assigns labels
+6. **Output**: `Caption` object contains all results, exported to desired format
+
+**Configuration Philosophy:**
+- ✅ **Declarative**: Describe what you want, not how to do it
+- ✅ **Composable**: Mix and match configurations
+- ✅ **Reproducible**: Save configs to YAML for consistent results
+- ✅ **Flexible**: Override configs per-method or globally
+
+---
+
+## Performance & Optimization
+
+### Device Selection
+
+Choose the optimal device for your hardware:
+
+```python
+from lattifai import LattifAI, AlignmentConfig
+
+# NVIDIA GPU (recommended for speed)
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cuda")
+)
+
+# Apple Silicon GPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="mps")
+)
+
+# CPU (maximum compatibility)
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cpu")
+)
+```
+
+**Performance Comparison** (30-minute audio):
+
+| Device | Time |
+|--------|------|
+| CUDA (RTX 4090) | ~18 sec |
+| MPS (M4) | ~26 sec |
+
+### Memory Management
+
+**Streaming Mode** for long audio:
+
+```python
+# Process 20-hour audio with <10GB RAM
+caption = client.alignment(
+    input_media="long_audio.wav",
+    input_caption="subtitle.srt",
+    streaming_chunk_secs=600.0,  # 10-minute chunks
+)
+```
+
+**Memory Usage** (approximate):
+
+| Chunk Size | Peak RAM | Suitable For |
+|------------|----------|-------------|
+| 600 sec | ~5 GB | Recommended |
+| No streaming | ~10 GB+ | Short audio only |
+
+### Optimization Tips
+
+1. **Use GPU when available**: 10x faster than CPU
+2. **WIP: Enable streaming for long audio**: Process 20+ hour files without OOM
+3. **Choose appropriate chunk size**: Balance memory vs. performance
+4. **Batch processing**: Process multiple files in sequence (coming soon)
+5. **Profile alignment**: Set `client.profile=True` to identify bottlenecks
+
+---
+
 ## Supported Formats
 
 LattifAI supports virtually all common media and subtitle formats:
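Before the unchanged formats table, here is a sketch of the composable configuration style described in the Architecture Overview above. It only combines objects this README documents elsewhere (`AlignmentConfig` from Device Selection, `TranscriptionConfig` from Language Selection); treating `profile` as a plain attribute follows Optimization Tip 5 but is otherwise an assumption.

```python
from lattifai import AlignmentConfig, LattifAI, TranscriptionConfig

# Compose independent config objects into one client ("Composable" in the philosophy list).
client = LattifAI(
    alignment_config=AlignmentConfig(device="mps"),   # as in Device Selection
    transcription_config=TranscriptionConfig(
        model_name="nvidia/parakeet-tdt-0.6b-v3",
        language="de",                                # as in Language Selection
    ),
)

# Optimization Tip 5: enable profiling to identify bottlenecks (attribute form assumed).
client.profile = True

caption = client.alignment(
    input_media="audio.wav",
    input_caption="subtitle.srt",
)
```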
@@ -778,14 +1011,77 @@ LattifAI supports virtually all common media and subtitle formats:
 
 ---
 
+## Supported Languages
+
+LattifAI supports multiple transcription models with different language capabilities:
+
+### Gemini Models (100+ Languages)
+
+**Models**: `gemini-2.5-pro`, `gemini-3-pro-preview`, `gemini-3-flash-preview`
+
+**Supported Languages**: English, Chinese (Mandarin & Cantonese), Spanish, French, German, Italian, Portuguese, Japanese, Korean, Arabic, Russian, Hindi, Bengali, Turkish, Dutch, Polish, Swedish, Danish, Norwegian, Finnish, Greek, Hebrew, Thai, Vietnamese, Indonesian, Malay, Filipino, Ukrainian, Czech, Romanian, Hungarian, Swahili, Tamil, Telugu, Marathi, Gujarati, Kannada, and 70+ more languages.
+
+> **Note**: Requires Gemini API key from [Google AI Studio](https://aistudio.google.com/apikey)
+
+### NVIDIA Parakeet (24 European Languages)
+
+**Model**: `nvidia/parakeet-tdt-0.6b-v3`
+
+**Supported Languages**:
+- **Western Europe**: English (en), French (fr), German (de), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl)
+- **Nordic**: Danish (da), Swedish (sv), Norwegian (no), Finnish (fi)
+- **Eastern Europe**: Polish (pl), Czech (cs), Slovak (sk), Hungarian (hu), Romanian (ro), Bulgarian (bg), Ukrainian (uk), Russian (ru)
+- **Others**: Croatian (hr), Estonian (et), Latvian (lv), Lithuanian (lt), Slovenian (sl), Maltese (mt), Greek (el)
+
+### Alibaba SenseVoice (5 Asian Languages)
+
+**Model**: `iic/SenseVoiceSmall`
+
+**Supported Languages**:
+- Chinese/Mandarin (zh)
+- English (en)
+- Japanese (ja)
+- Korean (ko)
+- Cantonese (yue)
+
+### Language Selection
+
+```python
+from lattifai import LattifAI, TranscriptionConfig
+
+# Specify language for transcription
+client = LattifAI(
+    transcription_config=TranscriptionConfig(
+        model_name="nvidia/parakeet-tdt-0.6b-v3",
+        language="de",  # German
+    )
+)
+```
+
+**CLI Usage:**
+```bash
+lai transcribe run audio.wav output.srt \
+    transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
+    transcription.language=de
+```
+
+> **Tip**: Use Gemini models for maximum language coverage, Parakeet for European languages, and SenseVoice for Asian languages.
+
+---
+
 ## Roadmap
 
 Visit our [LattifAI roadmap](https://lattifai.com/roadmap) for the latest updates.
 
-| Date | Release | Features |
+| Date | Model Release | Features |
 |------|---------|----------|
 | **Oct 2025** | **Lattice-1-Alpha** | ✅ English forced alignment<br>✅ Multi-format support<br>✅ CPU/GPU optimization |
-| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br
+| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br>✅ Speaker Diarization<br>✅ Multi-model transcription (Gemini, Parakeet, SenseVoice)<br>✅ Web interface with React<br>🚧 Advanced segmentation strategies (entire/transcription/hybrid)<br>🚧 Audio event detection ([MUSIC], [APPLAUSE], etc.)<br> |
+| **Q1 2026** | **Lattice-2** | ✅ Streaming mode for long audio<br>🔮 40+ languages support<br>🔮 Real-time alignment |
+
+
+
+**Legend**: ✅ Released | 🚧 In Development | 📋 Planned | 🔮 Future
 
 ---
 
{lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/RECORD

@@ -1,44 +1,45 @@
-lattifai/__init__.py,sha256=
-lattifai/audio2.py,sha256=
-lattifai/client.py,sha256=
+lattifai/__init__.py,sha256=l7dIodSCVMHUXQkd8BVGBoDdYojBCh_lyBWlVibynk8,2695
+lattifai/audio2.py,sha256=P3N8_BwiscbetzDbkbj-n8BcMu2vWD6-MvtQvGwWWf0,17448
+lattifai/client.py,sha256=Vqg4vY--6tox9Js0qGWlE7IGeHJVyQeYLTXYtlzPk3w,19020
 lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
 lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
-lattifai/mixin.py,sha256=
+lattifai/mixin.py,sha256=wdgxEhgxR--dHXmeiJZ4AQDxEjKo49GLYQ0BXJw3qpk,25206
 lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
-lattifai/utils.py,sha256=
+lattifai/utils.py,sha256=cMiC5CY6gSMtcOtf_wK1BBMBEfHwc5R_S8_NIoVYk6I,5321
 lattifai/alignment/__init__.py,sha256=ehpkKfjNIYUx7_M-RWD_8Efcrzd9bE-NSm0QgMMVLW0,178
-lattifai/alignment/lattice1_aligner.py,sha256=
-lattifai/alignment/lattice1_worker.py,sha256=
+lattifai/alignment/lattice1_aligner.py,sha256=wm1BWNu4h-b507OAvLV0ITi7g0qaWthOPwvzWFHKyZQ,6251
+lattifai/alignment/lattice1_worker.py,sha256=ls2o3pVChB63OQrElJOmHzYIhCkjBFPt8EsLIVR1sJ0,11104
 lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
 lattifai/alignment/segmenter.py,sha256=mzWEQC6hWZtI2mR2WU59W7qLHa7KXy7fdU6991kyUuQ,6276
-lattifai/alignment/tokenizer.py,sha256=
+lattifai/alignment/tokenizer.py,sha256=JY11uEe-v4KQLoHZuaHgdFqgxR3u_1D9ZXXMnB6hA-Q,22994
 lattifai/caption/__init__.py,sha256=6MM_2j6CaqwZ81LfSy4di2EP0ykvheRjMZKAYDx2rQs,477
-lattifai/caption/caption.py,sha256=
+lattifai/caption/caption.py,sha256=mZYobxuZ8tkJUkZMVvRTrNeGTdmIZYSXTEySQdaGQd8,54595
 lattifai/caption/gemini_reader.py,sha256=GqY2w78xGYCMDP5kD5WGS8jK0gntel2SK-EPpPKTrwU,15138
 lattifai/caption/gemini_writer.py,sha256=sYPxYEmVQcEan5WVGgSrcraxs3QJRQRh8CJkl2yUQ1s,6515
 lattifai/caption/supervision.py,sha256=DRrM8lfKU_x9aVBcLG6xnT0xIJrnc8jzHpzcSwQOg8c,905
 lattifai/caption/text_parser.py,sha256=XDb8KTt031uJ1hg6dpbINglGOTX-6pBcghbg3DULM1I,4633
-lattifai/cli/__init__.py,sha256=
+lattifai/cli/__init__.py,sha256=LafsAf8YfDcfTeJ1IevFcyLm-mNbxpOOnm33OFKtpDM,523
 lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
 lattifai/cli/app_installer.py,sha256=gAndH3Yo97fGRDe2CQnGtOgZZ4k3_v5ftcUo5g6xbSA,5884
-lattifai/cli/caption.py,sha256=
+lattifai/cli/caption.py,sha256=4qQ9DFhxcfaeFMY0TB5I42x4W_gOo2zY6kjXnHnFDms,6313
+lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
 lattifai/cli/server.py,sha256=sXMfOSse9-V79slXUU8FDLeqtI5U9zeU-5YpjTIGyVw,1186
-lattifai/cli/transcribe.py,sha256=
-lattifai/cli/youtube.py,sha256
+lattifai/cli/transcribe.py,sha256=_vHzrdaGiPepQWATqvEDYDjwzfVLAd2i8RjOLkvdb0w,8218
+lattifai/cli/youtube.py,sha256=9M2dpcUCvT7vVbXJCIxJwe9klJXoF2jUeLxiatslYso,6063
 lattifai/config/__init__.py,sha256=Z8OudvS6fgfLNLu_2fvoXartQiYCECOnNfzDt-PfCN4,543
-lattifai/config/alignment.py,sha256=
+lattifai/config/alignment.py,sha256=vLiH150YWvBUiVkFOIO-nPXCB-b8fP9iSZgS79k1Qbg,4586
 lattifai/config/caption.py,sha256=AYOyUJ1xZsX8CBZy3GpLitbcCAHcZ9LwXui_v3vtuso,6786
-lattifai/config/client.py,sha256=
+lattifai/config/client.py,sha256=46b816MiYja3Uan_3wjnhtqDr0M6T-FqEygJ3e50IZc,1664
 lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
 lattifai/config/media.py,sha256=cjM8eGeZ7ELhmy4cCqHAyogeHItaVqMrPzSwwIx79HY,14856
-lattifai/config/transcription.py,sha256=
+lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
 lattifai/diarization/__init__.py,sha256=MgBDQ1ehL2qDnZprEp8KqON7CmbG-qaP37gzBsV0jzk,119
-lattifai/diarization/lattifai.py,sha256=
+lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
 lattifai/server/app.py,sha256=wXYgXc_yGQACtUJdhkfhLsTOQjhhIhDQRiVRny7Ogcs,15455
-lattifai/transcription/__init__.py,sha256=
-lattifai/transcription/base.py,sha256=
-lattifai/transcription/gemini.py,sha256=
-lattifai/transcription/lattifai.py,sha256=
+lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
+lattifai/transcription/base.py,sha256=v_b1_JGYiBqeMmwns0wHCJ7UOm6j9k-76Uzbr-qmzrs,4467
+lattifai/transcription/gemini.py,sha256=LJSQt9nGqQdEG6ZFXoHWltumyMEM7-Ezy8ss0iPJb7k,12414
+lattifai/transcription/lattifai.py,sha256=EKEdCafgdRWKw_084eD07BqGh2_D-qo3ig3H5X3XYGg,4621
 lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
 lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
 lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
@@ -47,10 +48,10 @@ lattifai/workflow/__init__.py,sha256=GOT9jptXwpIMiNRqJ_LToEt_5Dt0k7XXbLkFzhrl31o
 lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
 lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
 lattifai/workflow/file_manager.py,sha256=IUWW838ta83kfwM4gpW83gsD_Tx-pa-L_RWKjiefQbQ,33017
-lattifai/workflow/youtube.py,sha256=
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
+lattifai/workflow/youtube.py,sha256=0B1l_8gdz_O0cy2c9AY9wRPizESQrpRuCP4rwvWRxLA,23687
+lattifai-1.2.0.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
+lattifai-1.2.0.dist-info/METADATA,sha256=9iEaT3muzKIUmIvQ0oqg4DhM_CvZ53jHvk97kHfPNlQ,37399
+lattifai-1.2.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+lattifai-1.2.0.dist-info/entry_points.txt,sha256=nHZri2VQkPYEl0tQ0dMYTpVGlCOgVWlDG_JtDR3QXF8,545
+lattifai-1.2.0.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+lattifai-1.2.0.dist-info/RECORD,,
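Each RECORD row above has the form `path,sha256=<digest>,<size>`, where the digest is an unpadded urlsafe-base64 SHA-256 of the installed file and the last field is its size in bytes (the standard wheel RECORD convention). A small sketch for checking one entry locally; the installation path is a placeholder.

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: Path) -> tuple[str, int]:
    """Return (sha256=<urlsafe-b64 digest, unpadded>, size-in-bytes) in RECORD style."""
    data = path.read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"sha256={digest.decode('ascii')}", len(data)


# Placeholder path: point at the installed copy of a file named in a RECORD row,
# e.g. lattifai/errors.py, whose unchanged entry above reads
# "sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291".
digest, size = record_entry(Path("site-packages/lattifai/errors.py"))
print(digest, size)
```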
{lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/entry_points.txt

@@ -1,6 +1,7 @@
 [console_scripts]
 lai-align = lattifai.cli.alignment:main
 lai-app-install = lattifai.cli.app_installer:main
+lai-diarize = lattifai.cli.diarization:main
 lai-server = lattifai.cli.server:main
 lai-transcribe = lattifai.cli.transcribe:main
 lai-youtube = lattifai.cli.youtube:main
@@ -11,4 +12,5 @@ laicap-shift = lattifai.cli.caption:main_shift
 [lai_run.cli]
 alignment = lattifai.cli
 caption = lattifai.cli
+diarization = lattifai.cli
 transcribe = lattifai.cli
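The two added rows register the new diarization CLI both as a `lai-diarize` console script and under the `lai_run.cli` plugin group. After installing the wheel, the registrations can be confirmed with the standard library alone (Python 3.10+, matching the Requires-Python range above); this is a generic entry-point lookup, not a lattifai API.

```python
from importlib.metadata import entry_points

# Console scripts contributed by this package (lai-align, lai-diarize, lai-transcribe, ...).
for ep in entry_points(group="console_scripts"):
    if ep.value.startswith("lattifai."):
        print(f"{ep.name} -> {ep.value}")

# The lai_run.cli plugin group now also exposes "diarization".
for ep in entry_points(group="lai_run.cli"):
    print(f"lai_run.cli: {ep.name} -> {ep.value}")
```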
{lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/WHEEL: File without changes
{lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/licenses/LICENSE: File without changes
{lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/top_level.txt: File without changes