lattifai 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +11 -12
- lattifai/alignment/lattice1_aligner.py +11 -8
- lattifai/alignment/lattice1_worker.py +125 -151
- lattifai/alignment/tokenizer.py +27 -12
- lattifai/audio2.py +1 -1
- lattifai/cli/diarization.py +3 -1
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +5 -0
- lattifai/config/client.py +5 -0
- lattifai/mixin.py +7 -4
- lattifai/utils.py +21 -59
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.1.0.dist-info → lattifai-1.2.0.dist-info}/METADATA +330 -48
- {lattifai-1.1.0.dist-info → lattifai-1.2.0.dist-info}/RECORD +18 -18
- {lattifai-1.1.0.dist-info → lattifai-1.2.0.dist-info}/WHEEL +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.0.dist-info}/entry_points.txt +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lattifai
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
|
|
5
5
|
Author-email: Lattifai Technologies <tech@lattifai.com>
|
|
6
6
|
Maintainer-email: Lattice <tech@lattifai.com>
|
|
@@ -50,6 +50,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
50
50
|
Requires-Python: <3.15,>=3.10
|
|
51
51
|
Description-Content-Type: text/markdown
|
|
52
52
|
License-File: LICENSE
|
|
53
|
+
Requires-Dist: k2py>=0.2.1
|
|
53
54
|
Requires-Dist: lattifai-core>=0.6.0
|
|
54
55
|
Requires-Dist: lattifai-run>=1.0.1
|
|
55
56
|
Requires-Dist: python-dotenv
|
|
@@ -65,6 +66,7 @@ Requires-Dist: scipy!=1.16.3
|
|
|
65
66
|
Requires-Dist: g2p-phonemizer>=0.4.0
|
|
66
67
|
Requires-Dist: av
|
|
67
68
|
Requires-Dist: wtpsplit>=2.1.7
|
|
69
|
+
Requires-Dist: modelscope==1.33.0
|
|
68
70
|
Requires-Dist: OmniSenseVoice>=0.4.2
|
|
69
71
|
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4
|
|
70
72
|
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2
|
|
@@ -116,6 +118,7 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
|
|
|
116
118
|
|
|
117
119
|
## Table of Contents
|
|
118
120
|
|
|
121
|
+
- [Core Capabilities](#core-capabilities)
|
|
119
122
|
- [Installation](#installation)
|
|
120
123
|
- [Quick Start](#quick-start)
|
|
121
124
|
- [Command Line Interface](#command-line-interface)
|
|
@@ -132,16 +135,45 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
|
|
|
132
135
|
- [YouTube Processing](#youtube-processing)
|
|
133
136
|
- [Configuration Objects](#configuration-objects)
|
|
134
137
|
- [Advanced Features](#advanced-features)
|
|
138
|
+
- [Audio Preprocessing](#audio-preprocessing)
|
|
139
|
+
- [Long-Form Audio Support](#long-form-audio-support)
|
|
135
140
|
- [Word-Level Alignment](#word-level-alignment)
|
|
136
141
|
- [Smart Sentence Splitting](#smart-sentence-splitting)
|
|
137
142
|
- [Speaker Diarization](#speaker-diarization)
|
|
138
143
|
- [YAML Configuration Files](#yaml-configuration-files)
|
|
144
|
+
- [Architecture Overview](#architecture-overview)
|
|
145
|
+
- [Performance & Optimization](#performance--optimization)
|
|
139
146
|
- [Supported Formats](#supported-formats)
|
|
147
|
+
- [Supported Languages](#supported-languages)
|
|
140
148
|
- [Roadmap](#roadmap)
|
|
141
149
|
- [Development](#development)
|
|
142
150
|
|
|
143
151
|
---
|
|
144
152
|
|
|
153
|
+
## Core Capabilities
|
|
154
|
+
|
|
155
|
+
LattifAI provides comprehensive audio-text alignment powered by the Lattice-1 model:
|
|
156
|
+
|
|
157
|
+
| Feature | Description | Status |
|
|
158
|
+
|---------|-------------|--------|
|
|
159
|
+
| **Forced Alignment** | Precise word-level and segment-level synchronization with audio | ✅ Production |
|
|
160
|
+
| **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) | ✅ Production |
|
|
161
|
+
| **Speaker Diarization** | Automatic multi-speaker identification with label preservation | ✅ Production |
|
|
162
|
+
| **Audio Preprocessing** | Multi-channel selection, device optimization (CPU/CUDA/MPS) | ✅ Production |
|
|
163
|
+
| **Streaming Mode** | Process audio up to 20 hours with minimal memory footprint | ✅ Production |
|
|
164
|
+
| **Smart Text Processing** | Intelligent sentence splitting and non-speech element separation | ✅ Production |
|
|
165
|
+
| **Universal Format Support** | 30+ caption/subtitle formats with text normalization | ✅ Production |
|
|
166
|
+
| **Configuration System** | YAML-based configs for reproducible workflows | ✅ Production |
|
|
167
|
+
|
|
168
|
+
**Key Highlights:**
|
|
169
|
+
- 🎯 **Accuracy**: State-of-the-art alignment precision with Lattice-1 model
|
|
170
|
+
- 🌍 **Multilingual**: Support for 100+ languages via multiple transcription models
|
|
171
|
+
- 🚀 **Performance**: Hardware-accelerated processing with streaming support
|
|
172
|
+
- 🔧 **Flexible**: CLI, Python SDK, and Web UI interfaces
|
|
173
|
+
- 📦 **Production-Ready**: Battle-tested on diverse audio/video content
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
145
177
|
## Installation
|
|
146
178
|
|
|
147
179
|
### Step 1: Install SDK
|
|
@@ -149,9 +181,6 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
|
|
|
149
181
|
**Using pip:**
|
|
150
182
|
```bash
|
|
151
183
|
|
|
152
|
-
pip install install-k2
|
|
153
|
-
install-k2 --torch-version 2.9.1 # if not set will auto-detect PyTorch version and install compatible k2
|
|
154
|
-
|
|
155
184
|
pip install lattifai
|
|
156
185
|
```
|
|
157
186
|
|
|
@@ -165,30 +194,11 @@ uv init my-project
|
|
|
165
194
|
cd my-project
|
|
166
195
|
source .venv/bin/activate
|
|
167
196
|
|
|
168
|
-
# Install k2 (required dependency)
|
|
169
|
-
uv pip install install-k2
|
|
170
|
-
uv pip install pip
|
|
171
|
-
uv run install-k2 --torch-version 2.9.1
|
|
172
|
-
|
|
173
197
|
# Install LattifAI
|
|
174
198
|
uv pip install lattifai
|
|
175
199
|
```
|
|
176
200
|
|
|
177
|
-
> **Note**: `install-k2` automatically detects your PyTorch version (up to 2.9) and installs the compatible k2 wheel.
|
|
178
|
-
|
|
179
|
-
<details>
|
|
180
|
-
<summary><b>install-k2 options</b></summary>
|
|
181
201
|
|
|
182
|
-
```
|
|
183
|
-
usage: install-k2 [-h] [--system {linux,darwin,windows}] [--dry-run] [--torch-version TORCH_VERSION]
|
|
184
|
-
|
|
185
|
-
optional arguments:
|
|
186
|
-
-h, --help Show this help message and exit
|
|
187
|
-
--system {linux,darwin,windows} Override OS detection
|
|
188
|
-
--dry-run Show what would be installed without making changes
|
|
189
|
-
--torch-version TORCH_VERSION Specify torch version (e.g., 2.8.0)
|
|
190
|
-
```
|
|
191
|
-
</details>
|
|
192
202
|
|
|
193
203
|
### Step 2: Get Your API Key
|
|
194
204
|
|
|
@@ -252,7 +262,7 @@ caption = client.alignment(
|
|
|
252
262
|
|
|
253
263
|
That's it! Your aligned subtitles are saved to `aligned.srt`.
|
|
254
264
|
|
|
255
|
-
### Web Interface
|
|
265
|
+
### 🚧 Web Interface
|
|
256
266
|
|
|
257
267
|

|
|
258
268
|
|
|
@@ -310,13 +320,9 @@ That's it! Your aligned subtitles are saved to `aligned.srt`.
|
|
|
310
320
|
The web interface will automatically open in your browser at `http://localhost:5173`.
|
|
311
321
|
|
|
312
322
|
**Features:**
|
|
313
|
-
- ✅
|
|
314
|
-
- ✅
|
|
315
|
-
- ✅
|
|
316
|
-
- ✅ Multiple subtitle format support
|
|
317
|
-
- ✅ Built-in transcription with multiple models
|
|
318
|
-
- ✅ API key management interface
|
|
319
|
-
- ✅ Download aligned subtitles in various formats
|
|
323
|
+
- ✅ **Drag-and-Drop Upload**: Visual file upload for audio/video and captions
|
|
324
|
+
- ✅ **Real-Time Progress**: Live alignment progress with detailed status
|
|
325
|
+
- ✅ **Multiple Transcription Models**: Gemini, Parakeet, SenseVoice selection
|
|
320
326
|
|
|
321
327
|
---
|
|
322
328
|
|
|
@@ -617,6 +623,78 @@ from lattifai import (
|
|
|
617
623
|
|
|
618
624
|
## Advanced Features
|
|
619
625
|
|
|
626
|
+
### Audio Preprocessing
|
|
627
|
+
|
|
628
|
+
LattifAI provides powerful audio preprocessing capabilities for optimal alignment:
|
|
629
|
+
|
|
630
|
+
**Channel Selection**
|
|
631
|
+
|
|
632
|
+
Control which audio channel to process for stereo/multi-channel files:
|
|
633
|
+
|
|
634
|
+
```python
|
|
635
|
+
from lattifai import LattifAI
|
|
636
|
+
|
|
637
|
+
client = LattifAI()
|
|
638
|
+
|
|
639
|
+
# Use left channel only
|
|
640
|
+
caption = client.alignment(
|
|
641
|
+
input_media="stereo.wav",
|
|
642
|
+
input_caption="subtitle.srt",
|
|
643
|
+
channel_selector="left", # Options: "left", "right", "average", or channel index (0, 1, 2, ...)
|
|
644
|
+
)
|
|
645
|
+
|
|
646
|
+
# Average all channels (default)
|
|
647
|
+
caption = client.alignment(
|
|
648
|
+
input_media="stereo.wav",
|
|
649
|
+
input_caption="subtitle.srt",
|
|
650
|
+
channel_selector="average",
|
|
651
|
+
)
|
|
652
|
+
```
|
|
653
|
+
|
|
654
|
+
**CLI Usage:**
|
|
655
|
+
```bash
|
|
656
|
+
# Use right channel
|
|
657
|
+
lai alignment align audio.wav subtitle.srt output.srt \
|
|
658
|
+
media.channel_selector=right
|
|
659
|
+
|
|
660
|
+
# Use specific channel index
|
|
661
|
+
lai alignment align audio.wav subtitle.srt output.srt \
|
|
662
|
+
media.channel_selector=1
|
|
663
|
+
```
|
|
664
|
+
|
|
665
|
+
**Device Management**
|
|
666
|
+
|
|
667
|
+
Optimize processing for your hardware:
|
|
668
|
+
|
|
669
|
+
```python
|
|
670
|
+
from lattifai import LattifAI, AlignmentConfig
|
|
671
|
+
|
|
672
|
+
# Use CUDA GPU
|
|
673
|
+
client = LattifAI(
|
|
674
|
+
alignment_config=AlignmentConfig(device="cuda")
|
|
675
|
+
)
|
|
676
|
+
|
|
677
|
+
# Use specific GPU
|
|
678
|
+
client = LattifAI(
|
|
679
|
+
alignment_config=AlignmentConfig(device="cuda:0")
|
|
680
|
+
)
|
|
681
|
+
|
|
682
|
+
# Use Apple Silicon MPS
|
|
683
|
+
client = LattifAI(
|
|
684
|
+
alignment_config=AlignmentConfig(device="mps")
|
|
685
|
+
)
|
|
686
|
+
|
|
687
|
+
# Use CPU
|
|
688
|
+
client = LattifAI(
|
|
689
|
+
alignment_config=AlignmentConfig(device="cpu")
|
|
690
|
+
)
|
|
691
|
+
```
|
|
692
|
+
|
|
693
|
+
**Supported Formats**
|
|
694
|
+
- **Audio**: WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more
|
|
695
|
+
- **Video**: MP4, MKV, MOV, WEBM, AVI, and more
|
|
696
|
+
- All formats supported by FFmpeg are compatible
|
|
697
|
+
|
|
620
698
|
### Long-Form Audio Support
|
|
621
699
|
|
|
622
700
|
LattifAI now supports processing long audio files (up to 20 hours) through streaming mode. Enable streaming by setting the `streaming_chunk_secs` parameter:
|
|
@@ -658,14 +736,18 @@ client = LattifAI(
|
|
|
658
736
|
)
|
|
659
737
|
```
|
|
660
738
|
|
|
661
|
-
**
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
739
|
+
**Technical Details:**
|
|
740
|
+
|
|
741
|
+
| Parameter | Description | Recommendation |
|
|
742
|
+
|-----------|-------------|----------------|
|
|
743
|
+
| **Default Value** | 600 seconds (10 minutes) | Good for most use cases |
|
|
744
|
+
| **Memory Impact** | Smaller chunks = less RAM usage | Adjust based on available RAM |
|
|
745
|
+
| **Accuracy Impact** | Virtually zero degradation | Our precise implementation preserves quality |
|
|
746
|
+
|
|
747
|
+
**Performance Characteristics:**
|
|
748
|
+
- ✅ **Near-Perfect Accuracy**: Streaming implementation maintains alignment precision
|
|
749
|
+
- 🚧 **Memory Efficient**: Process 20-hour audio with <10GB RAM (600-sec chunks)
|
|
750
|
+
|
|
669
751
|
|
|
670
752
|
### Word-Level Alignment
|
|
671
753
|
|
|
@@ -708,14 +790,30 @@ caption = client.alignment(
|
|
|
708
790
|
|
|
709
791
|
### Speaker Diarization
|
|
710
792
|
|
|
711
|
-
Speaker diarization automatically identifies and labels different speakers in audio
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
-
|
|
793
|
+
Speaker diarization automatically identifies and labels different speakers in audio using state-of-the-art models.
|
|
794
|
+
|
|
795
|
+
**Core Capabilities:**
|
|
796
|
+
- 🎤 **Multi-Speaker Detection**: Automatically detect speaker changes in audio
|
|
797
|
+
- 🏷️ **Smart Labeling**: Assign speaker labels (SPEAKER_00, SPEAKER_01, etc.)
|
|
798
|
+
- 🔄 **Label Preservation**: Maintain existing speaker names from input captions
|
|
799
|
+
- 🤖 **Gemini Integration**: Extract speaker names intelligently during transcription
|
|
800
|
+
|
|
801
|
+
**How It Works:**
|
|
802
|
+
|
|
803
|
+
1. **Without Existing Labels**: System assigns generic labels (SPEAKER_00, SPEAKER_01)
|
|
804
|
+
2. **With Existing Labels**: System preserves your speaker names during alignment
|
|
805
|
+
- Formats: `[Alice]`, `>> Bob:`, `SPEAKER_01:`, `Alice:` are all recognized
|
|
806
|
+
3. **Gemini Transcription**: When using Gemini models, speaker names are extracted from context
|
|
807
|
+
- Example: "Hi, I'm Alice" → System labels as `Alice` instead of `SPEAKER_00`
|
|
715
808
|
|
|
716
|
-
**Speaker
|
|
717
|
-
|
|
718
|
-
|
|
809
|
+
**Speaker Label Integration:**
|
|
810
|
+
|
|
811
|
+
The diarization engine intelligently matches detected speakers with existing labels:
|
|
812
|
+
- If input captions have speaker names → **Preserved during alignment**
|
|
813
|
+
- If Gemini transcription provides names → **Used for labeling**
|
|
814
|
+
- Otherwise → **Generic labels (SPEAKER_00, etc.) assigned**
|
|
815
|
+
* 🚧 **Future Enhancement:**
|
|
816
|
+
- **AI-Powered Speaker Name Inference**: An upcoming feature will use large language models combined with metadata (video title, description, context) to intelligently infer speaker names, making transcripts more human-readable and contextually accurate.
|
|
719
817
|
|
|
720
818
|
**CLI:**
|
|
721
819
|
```bash
|
|
@@ -756,6 +854,8 @@ for segment in caption.supervisions:
|
|
|
756
854
|
|
|
757
855
|
### YAML Configuration Files
|
|
758
856
|
|
|
857
|
+
* **under development**
|
|
858
|
+
|
|
759
859
|
Create reusable configuration files:
|
|
760
860
|
|
|
761
861
|
```yaml
|
|
@@ -772,6 +872,125 @@ lai alignment align audio.wav subtitle.srt output.srt \
|
|
|
772
872
|
|
|
773
873
|
---
|
|
774
874
|
|
|
875
|
+
## Architecture Overview
|
|
876
|
+
|
|
877
|
+
LattifAI uses a modular, config-driven architecture for maximum flexibility:
|
|
878
|
+
|
|
879
|
+
```
|
|
880
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
881
|
+
│ LattifAI Client │
|
|
882
|
+
├─────────────────────────────────────────────────────────────┤
|
|
883
|
+
│ Configuration Layer (Config-Driven) │
|
|
884
|
+
│ ├── ClientConfig (API settings) │
|
|
885
|
+
│ ├── AlignmentConfig (Model & device) │
|
|
886
|
+
│ ├── CaptionConfig (I/O formats) │
|
|
887
|
+
│ ├── TranscriptionConfig (ASR models) │
|
|
888
|
+
│ └── DiarizationConfig (Speaker detection) │
|
|
889
|
+
├─────────────────────────────────────────────────────────────┤
|
|
890
|
+
│ Core Components │
|
|
891
|
+
│ ├── AudioLoader → Load & preprocess audio │
|
|
892
|
+
│ ├── Aligner → Lattice-1 forced alignment │
|
|
893
|
+
│ ├── Transcriber → Multi-model ASR │
|
|
894
|
+
│ ├── Diarizer → Speaker identification │
|
|
895
|
+
│ └── Tokenizer → Intelligent text segmentation │
|
|
896
|
+
├─────────────────────────────────────────────────────────────┤
|
|
897
|
+
│ Data Flow │
|
|
898
|
+
│ Input → AudioLoader → Aligner → Diarizer → Caption │
|
|
899
|
+
│ ↓ │
|
|
900
|
+
│ Transcriber (optional) │
|
|
901
|
+
└─────────────────────────────────────────────────────────────┘
|
|
902
|
+
```
|
|
903
|
+
|
|
904
|
+
**Component Responsibilities:**
|
|
905
|
+
|
|
906
|
+
| Component | Purpose | Configuration |
|
|
907
|
+
|-----------|---------|---------------|
|
|
908
|
+
| **AudioLoader** | Load audio/video, channel selection, format conversion | `MediaConfig` |
|
|
909
|
+
| **Aligner** | Forced alignment using Lattice-1 model | `AlignmentConfig` |
|
|
910
|
+
| **Transcriber** | ASR with Gemini/Parakeet/SenseVoice | `TranscriptionConfig` |
|
|
911
|
+
| **Diarizer** | Speaker diarization with pyannote.audio | `DiarizationConfig` |
|
|
912
|
+
| **Tokenizer** | Sentence splitting and text normalization | `CaptionConfig` |
|
|
913
|
+
| **Caption** | Unified data structure for alignments | `CaptionConfig` |
|
|
914
|
+
|
|
915
|
+
**Data Flow:**
|
|
916
|
+
|
|
917
|
+
1. **Audio Loading**: `AudioLoader` loads media, applies channel selection, converts to numpy array
|
|
918
|
+
2. **Transcription** (optional): `Transcriber` generates transcript if no caption provided
|
|
919
|
+
3. **Text Preprocessing**: `Tokenizer` splits sentences and normalizes text
|
|
920
|
+
4. **Alignment**: `Aligner` uses Lattice-1 to compute word-level timestamps
|
|
921
|
+
5. **Diarization** (optional): `Diarizer` identifies speakers and assigns labels
|
|
922
|
+
6. **Output**: `Caption` object contains all results, exported to desired format
|
|
923
|
+
|
|
924
|
+
**Configuration Philosophy:**
|
|
925
|
+
- ✅ **Declarative**: Describe what you want, not how to do it
|
|
926
|
+
- ✅ **Composable**: Mix and match configurations
|
|
927
|
+
- ✅ **Reproducible**: Save configs to YAML for consistent results
|
|
928
|
+
- ✅ **Flexible**: Override configs per-method or globally
|
|
929
|
+
|
|
930
|
+
---
|
|
931
|
+
|
|
932
|
+
## Performance & Optimization
|
|
933
|
+
|
|
934
|
+
### Device Selection
|
|
935
|
+
|
|
936
|
+
Choose the optimal device for your hardware:
|
|
937
|
+
|
|
938
|
+
```python
|
|
939
|
+
from lattifai import LattifAI, AlignmentConfig
|
|
940
|
+
|
|
941
|
+
# NVIDIA GPU (recommended for speed)
|
|
942
|
+
client = LattifAI(
|
|
943
|
+
alignment_config=AlignmentConfig(device="cuda")
|
|
944
|
+
)
|
|
945
|
+
|
|
946
|
+
# Apple Silicon GPU
|
|
947
|
+
client = LattifAI(
|
|
948
|
+
alignment_config=AlignmentConfig(device="mps")
|
|
949
|
+
)
|
|
950
|
+
|
|
951
|
+
# CPU (maximum compatibility)
|
|
952
|
+
client = LattifAI(
|
|
953
|
+
alignment_config=AlignmentConfig(device="cpu")
|
|
954
|
+
)
|
|
955
|
+
```
|
|
956
|
+
|
|
957
|
+
**Performance Comparison** (30-minute audio):
|
|
958
|
+
|
|
959
|
+
| Device | Time |
|
|
960
|
+
|--------|------|
|
|
961
|
+
| CUDA (RTX 4090) | ~18 sec |
|
|
962
|
+
| MPS (M4) | ~26 sec |
|
|
963
|
+
|
|
964
|
+
### Memory Management
|
|
965
|
+
|
|
966
|
+
**Streaming Mode** for long audio:
|
|
967
|
+
|
|
968
|
+
```python
|
|
969
|
+
# Process 20-hour audio with <10GB RAM
|
|
970
|
+
caption = client.alignment(
|
|
971
|
+
input_media="long_audio.wav",
|
|
972
|
+
input_caption="subtitle.srt",
|
|
973
|
+
streaming_chunk_secs=600.0, # 10-minute chunks
|
|
974
|
+
)
|
|
975
|
+
```
|
|
976
|
+
|
|
977
|
+
**Memory Usage** (approximate):
|
|
978
|
+
|
|
979
|
+
| Chunk Size | Peak RAM | Suitable For |
|
|
980
|
+
|------------|----------|-------------|
|
|
981
|
+
| 600 sec | ~5 GB | Recommended |
|
|
982
|
+
| No streaming | ~10 GB+ | Short audio only |
|
|
983
|
+
|
|
984
|
+
### Optimization Tips
|
|
985
|
+
|
|
986
|
+
1. **Use GPU when available**: 10x faster than CPU
|
|
987
|
+
2. **WIP: Enable streaming for long audio**: Process 20+ hour files without OOM
|
|
988
|
+
3. **Choose appropriate chunk size**: Balance memory vs. performance
|
|
989
|
+
4. **Batch processing**: Process multiple files in sequence (coming soon)
|
|
990
|
+
5. **Profile alignment**: Set `client.profile=True` to identify bottlenecks
|
|
991
|
+
|
|
992
|
+
---
|
|
993
|
+
|
|
775
994
|
## Supported Formats
|
|
776
995
|
|
|
777
996
|
LattifAI supports virtually all common media and subtitle formats:
|
|
@@ -792,14 +1011,77 @@ LattifAI supports virtually all common media and subtitle formats:
|
|
|
792
1011
|
|
|
793
1012
|
---
|
|
794
1013
|
|
|
1014
|
+
## Supported Languages
|
|
1015
|
+
|
|
1016
|
+
LattifAI supports multiple transcription models with different language capabilities:
|
|
1017
|
+
|
|
1018
|
+
### Gemini Models (100+ Languages)
|
|
1019
|
+
|
|
1020
|
+
**Models**: `gemini-2.5-pro`, `gemini-3-pro-preview`, `gemini-3-flash-preview`
|
|
1021
|
+
|
|
1022
|
+
**Supported Languages**: English, Chinese (Mandarin & Cantonese), Spanish, French, German, Italian, Portuguese, Japanese, Korean, Arabic, Russian, Hindi, Bengali, Turkish, Dutch, Polish, Swedish, Danish, Norwegian, Finnish, Greek, Hebrew, Thai, Vietnamese, Indonesian, Malay, Filipino, Ukrainian, Czech, Romanian, Hungarian, Swahili, Tamil, Telugu, Marathi, Gujarati, Kannada, and 70+ more languages.
|
|
1023
|
+
|
|
1024
|
+
> **Note**: Requires a Gemini API key from [Google AI Studio](https://aistudio.google.com/apikey)
|
|
1025
|
+
|
|
1026
|
+
### NVIDIA Parakeet (24 European Languages)
|
|
1027
|
+
|
|
1028
|
+
**Model**: `nvidia/parakeet-tdt-0.6b-v3`
|
|
1029
|
+
|
|
1030
|
+
**Supported Languages**:
|
|
1031
|
+
- **Western Europe**: English (en), French (fr), German (de), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl)
|
|
1032
|
+
- **Nordic**: Danish (da), Swedish (sv), Norwegian (no), Finnish (fi)
|
|
1033
|
+
- **Eastern Europe**: Polish (pl), Czech (cs), Slovak (sk), Hungarian (hu), Romanian (ro), Bulgarian (bg), Ukrainian (uk), Russian (ru)
|
|
1034
|
+
- **Others**: Croatian (hr), Estonian (et), Latvian (lv), Lithuanian (lt), Slovenian (sl), Maltese (mt), Greek (el)
|
|
1035
|
+
|
|
1036
|
+
### Alibaba SenseVoice (5 Asian Languages)
|
|
1037
|
+
|
|
1038
|
+
**Model**: `iic/SenseVoiceSmall`
|
|
1039
|
+
|
|
1040
|
+
**Supported Languages**:
|
|
1041
|
+
- Chinese/Mandarin (zh)
|
|
1042
|
+
- English (en)
|
|
1043
|
+
- Japanese (ja)
|
|
1044
|
+
- Korean (ko)
|
|
1045
|
+
- Cantonese (yue)
|
|
1046
|
+
|
|
1047
|
+
### Language Selection
|
|
1048
|
+
|
|
1049
|
+
```python
|
|
1050
|
+
from lattifai import LattifAI, TranscriptionConfig
|
|
1051
|
+
|
|
1052
|
+
# Specify language for transcription
|
|
1053
|
+
client = LattifAI(
|
|
1054
|
+
transcription_config=TranscriptionConfig(
|
|
1055
|
+
model_name="nvidia/parakeet-tdt-0.6b-v3",
|
|
1056
|
+
language="de", # German
|
|
1057
|
+
)
|
|
1058
|
+
)
|
|
1059
|
+
```
|
|
1060
|
+
|
|
1061
|
+
**CLI Usage:**
|
|
1062
|
+
```bash
|
|
1063
|
+
lai transcribe run audio.wav output.srt \
|
|
1064
|
+
transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
|
|
1065
|
+
transcription.language=de
|
|
1066
|
+
```
|
|
1067
|
+
|
|
1068
|
+
> **Tip**: Use Gemini models for maximum language coverage, Parakeet for European languages, and SenseVoice for Asian languages.
|
|
1069
|
+
|
|
1070
|
+
---
|
|
1071
|
+
|
|
795
1072
|
## Roadmap
|
|
796
1073
|
|
|
797
1074
|
Visit our [LattifAI roadmap](https://lattifai.com/roadmap) for the latest updates.
|
|
798
1075
|
|
|
799
|
-
| Date | Release | Features |
|
|
1076
|
+
| Date | Model Release | Features |
|
|
800
1077
|
|------|---------|----------|
|
|
801
1078
|
| **Oct 2025** | **Lattice-1-Alpha** | ✅ English forced alignment<br>✅ Multi-format support<br>✅ CPU/GPU optimization |
|
|
802
|
-
| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br
|
|
1079
|
+
| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br>✅ Speaker Diarization<br>✅ Multi-model transcription (Gemini, Parakeet, SenseVoice)<br>✅ Web interface with React<br>🚧 Advanced segmentation strategies (entire/transcription/hybrid)<br>🚧 Audio event detection ([MUSIC], [APPLAUSE], etc.)<br> |
|
|
1080
|
+
| **Q1 2026** | **Lattice-2** | ✅ Streaming mode for long audio<br>🔮 40+ languages support<br>🔮 Real-time alignment |
|
|
1081
|
+
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
**Legend**: ✅ Released | 🚧 In Development | 📋 Planned | 🔮 Future
|
|
803
1085
|
|
|
804
1086
|
---
|
|
805
1087
|
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
lattifai/__init__.py,sha256=
|
|
2
|
-
lattifai/audio2.py,sha256=
|
|
3
|
-
lattifai/client.py,sha256=
|
|
1
|
+
lattifai/__init__.py,sha256=l7dIodSCVMHUXQkd8BVGBoDdYojBCh_lyBWlVibynk8,2695
|
|
2
|
+
lattifai/audio2.py,sha256=P3N8_BwiscbetzDbkbj-n8BcMu2vWD6-MvtQvGwWWf0,17448
|
|
3
|
+
lattifai/client.py,sha256=Vqg4vY--6tox9Js0qGWlE7IGeHJVyQeYLTXYtlzPk3w,19020
|
|
4
4
|
lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
|
|
5
5
|
lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
|
|
6
|
-
lattifai/mixin.py,sha256=
|
|
6
|
+
lattifai/mixin.py,sha256=wdgxEhgxR--dHXmeiJZ4AQDxEjKo49GLYQ0BXJw3qpk,25206
|
|
7
7
|
lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
|
|
8
|
-
lattifai/utils.py,sha256=
|
|
8
|
+
lattifai/utils.py,sha256=cMiC5CY6gSMtcOtf_wK1BBMBEfHwc5R_S8_NIoVYk6I,5321
|
|
9
9
|
lattifai/alignment/__init__.py,sha256=ehpkKfjNIYUx7_M-RWD_8Efcrzd9bE-NSm0QgMMVLW0,178
|
|
10
|
-
lattifai/alignment/lattice1_aligner.py,sha256=
|
|
11
|
-
lattifai/alignment/lattice1_worker.py,sha256=
|
|
10
|
+
lattifai/alignment/lattice1_aligner.py,sha256=wm1BWNu4h-b507OAvLV0ITi7g0qaWthOPwvzWFHKyZQ,6251
|
|
11
|
+
lattifai/alignment/lattice1_worker.py,sha256=ls2o3pVChB63OQrElJOmHzYIhCkjBFPt8EsLIVR1sJ0,11104
|
|
12
12
|
lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
|
|
13
13
|
lattifai/alignment/segmenter.py,sha256=mzWEQC6hWZtI2mR2WU59W7qLHa7KXy7fdU6991kyUuQ,6276
|
|
14
|
-
lattifai/alignment/tokenizer.py,sha256=
|
|
14
|
+
lattifai/alignment/tokenizer.py,sha256=JY11uEe-v4KQLoHZuaHgdFqgxR3u_1D9ZXXMnB6hA-Q,22994
|
|
15
15
|
lattifai/caption/__init__.py,sha256=6MM_2j6CaqwZ81LfSy4di2EP0ykvheRjMZKAYDx2rQs,477
|
|
16
16
|
lattifai/caption/caption.py,sha256=mZYobxuZ8tkJUkZMVvRTrNeGTdmIZYSXTEySQdaGQd8,54595
|
|
17
17
|
lattifai/caption/gemini_reader.py,sha256=GqY2w78xGYCMDP5kD5WGS8jK0gntel2SK-EPpPKTrwU,15138
|
|
@@ -22,14 +22,14 @@ lattifai/cli/__init__.py,sha256=LafsAf8YfDcfTeJ1IevFcyLm-mNbxpOOnm33OFKtpDM,523
|
|
|
22
22
|
lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
|
|
23
23
|
lattifai/cli/app_installer.py,sha256=gAndH3Yo97fGRDe2CQnGtOgZZ4k3_v5ftcUo5g6xbSA,5884
|
|
24
24
|
lattifai/cli/caption.py,sha256=4qQ9DFhxcfaeFMY0TB5I42x4W_gOo2zY6kjXnHnFDms,6313
|
|
25
|
-
lattifai/cli/diarization.py,sha256=
|
|
25
|
+
lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
|
|
26
26
|
lattifai/cli/server.py,sha256=sXMfOSse9-V79slXUU8FDLeqtI5U9zeU-5YpjTIGyVw,1186
|
|
27
27
|
lattifai/cli/transcribe.py,sha256=_vHzrdaGiPepQWATqvEDYDjwzfVLAd2i8RjOLkvdb0w,8218
|
|
28
|
-
lattifai/cli/youtube.py,sha256
|
|
28
|
+
lattifai/cli/youtube.py,sha256=9M2dpcUCvT7vVbXJCIxJwe9klJXoF2jUeLxiatslYso,6063
|
|
29
29
|
lattifai/config/__init__.py,sha256=Z8OudvS6fgfLNLu_2fvoXartQiYCECOnNfzDt-PfCN4,543
|
|
30
30
|
lattifai/config/alignment.py,sha256=vLiH150YWvBUiVkFOIO-nPXCB-b8fP9iSZgS79k1Qbg,4586
|
|
31
31
|
lattifai/config/caption.py,sha256=AYOyUJ1xZsX8CBZy3GpLitbcCAHcZ9LwXui_v3vtuso,6786
|
|
32
|
-
lattifai/config/client.py,sha256=
|
|
32
|
+
lattifai/config/client.py,sha256=46b816MiYja3Uan_3wjnhtqDr0M6T-FqEygJ3e50IZc,1664
|
|
33
33
|
lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
|
|
34
34
|
lattifai/config/media.py,sha256=cjM8eGeZ7ELhmy4cCqHAyogeHItaVqMrPzSwwIx79HY,14856
|
|
35
35
|
lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
|
|
@@ -48,10 +48,10 @@ lattifai/workflow/__init__.py,sha256=GOT9jptXwpIMiNRqJ_LToEt_5Dt0k7XXbLkFzhrl31o
|
|
|
48
48
|
lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
|
|
49
49
|
lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
|
|
50
50
|
lattifai/workflow/file_manager.py,sha256=IUWW838ta83kfwM4gpW83gsD_Tx-pa-L_RWKjiefQbQ,33017
|
|
51
|
-
lattifai/workflow/youtube.py,sha256=
|
|
52
|
-
lattifai-1.
|
|
53
|
-
lattifai-1.
|
|
54
|
-
lattifai-1.
|
|
55
|
-
lattifai-1.
|
|
56
|
-
lattifai-1.
|
|
57
|
-
lattifai-1.
|
|
51
|
+
lattifai/workflow/youtube.py,sha256=0B1l_8gdz_O0cy2c9AY9wRPizESQrpRuCP4rwvWRxLA,23687
|
|
52
|
+
lattifai-1.2.0.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
|
|
53
|
+
lattifai-1.2.0.dist-info/METADATA,sha256=9iEaT3muzKIUmIvQ0oqg4DhM_CvZ53jHvk97kHfPNlQ,37399
|
|
54
|
+
lattifai-1.2.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
|
55
|
+
lattifai-1.2.0.dist-info/entry_points.txt,sha256=nHZri2VQkPYEl0tQ0dMYTpVGlCOgVWlDG_JtDR3QXF8,545
|
|
56
|
+
lattifai-1.2.0.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
|
|
57
|
+
lattifai-1.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|