lattifai 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -25
- lattifai/alignment/lattice1_aligner.py +12 -9
- lattifai/alignment/lattice1_worker.py +124 -155
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +23 -179
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/diarization.py +3 -1
- lattifai/cli/transcribe.py +3 -8
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +96 -47
- lattifai/config/alignment.py +2 -2
- lattifai/config/client.py +5 -0
- lattifai/mixin.py +17 -8
- lattifai/utils.py +40 -4
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +331 -48
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +24 -23
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
{lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lattifai
-Version: 1.1.0
+Version: 1.2.1
 Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
 Author-email: Lattifai Technologies <tech@lattifai.com>
 Maintainer-email: Lattice <tech@lattifai.com>
@@ -50,6 +50,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: <3.15,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: k2py>=0.2.1
 Requires-Dist: lattifai-core>=0.6.0
 Requires-Dist: lattifai-run>=1.0.1
 Requires-Dist: python-dotenv
@@ -65,6 +66,8 @@ Requires-Dist: scipy!=1.16.3
 Requires-Dist: g2p-phonemizer>=0.4.0
 Requires-Dist: av
 Requires-Dist: wtpsplit>=2.1.7
+Requires-Dist: modelscope==1.33.0
+Requires-Dist: error-align-fix>=0.1.2
 Requires-Dist: OmniSenseVoice>=0.4.2
 Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4
 Requires-Dist: pyannote-audio-notorchdeps>=4.0.2
@@ -116,6 +119,7 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
 
 ## Table of Contents
 
+- [Core Capabilities](#core-capabilities)
 - [Installation](#installation)
 - [Quick Start](#quick-start)
 - [Command Line Interface](#command-line-interface)
@@ -132,16 +136,45 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
 - [YouTube Processing](#youtube-processing)
 - [Configuration Objects](#configuration-objects)
 - [Advanced Features](#advanced-features)
+- [Audio Preprocessing](#audio-preprocessing)
+- [Long-Form Audio Support](#long-form-audio-support)
 - [Word-Level Alignment](#word-level-alignment)
 - [Smart Sentence Splitting](#smart-sentence-splitting)
 - [Speaker Diarization](#speaker-diarization)
 - [YAML Configuration Files](#yaml-configuration-files)
+- [Architecture Overview](#architecture-overview)
+- [Performance & Optimization](#performance--optimization)
 - [Supported Formats](#supported-formats)
+- [Supported Languages](#supported-languages)
 - [Roadmap](#roadmap)
 - [Development](#development)
 
 ---
 
+## Core Capabilities
+
+LattifAI provides comprehensive audio-text alignment powered by the Lattice-1 model:
+
+| Feature | Description | Status |
+|---------|-------------|--------|
+| **Forced Alignment** | Precise word-level and segment-level synchronization with audio | ✅ Production |
+| **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) | ✅ Production |
+| **Speaker Diarization** | Automatic multi-speaker identification with label preservation | ✅ Production |
+| **Audio Preprocessing** | Multi-channel selection, device optimization (CPU/CUDA/MPS) | ✅ Production |
+| **Streaming Mode** | Process audio up to 20 hours with minimal memory footprint | ✅ Production |
+| **Smart Text Processing** | Intelligent sentence splitting and non-speech element separation | ✅ Production |
+| **Universal Format Support** | 30+ caption/subtitle formats with text normalization | ✅ Production |
+| **Configuration System** | YAML-based configs for reproducible workflows | ✅ Production |
+
+**Key Highlights:**
+- 🎯 **Accuracy**: State-of-the-art alignment precision with the Lattice-1 model
+- 🌍 **Multilingual**: Support for 100+ languages via multiple transcription models
+- 🚀 **Performance**: Hardware-accelerated processing with streaming support
+- 🔧 **Flexible**: CLI, Python SDK, and Web UI interfaces
+- 📦 **Production-Ready**: Battle-tested on diverse audio/video content
+
+---
+
 ## Installation
 
 ### Step 1: Install SDK
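The capabilities listed above all funnel into a single `client.alignment(...)` call. A minimal sketch of that flow, assembled from the parameter names and snippets that appear in the other hunks on this page (the file names are placeholders, and it assumes the API key is already configured):

```python
from lattifai import LattifAI

client = LattifAI()  # assumes the API key is already set up in the environment
caption = client.alignment(
    input_media="episode.mp4",    # placeholder audio/video input
    input_caption="episode.srt",  # placeholder transcript/captions to align
)

# Each aligned segment is available on the returned caption object.
for segment in caption.supervisions:
    print(segment)
```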
@@ -149,9 +182,6 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
 **Using pip:**
 ```bash
 
-pip install install-k2
-install-k2 --torch-version 2.9.1  # if not set will auto-detect PyTorch version and install compatible k2
-
 pip install lattifai
 ```
 
@@ -165,30 +195,11 @@ uv init my-project
 cd my-project
 source .venv/bin/activate
 
-# Install k2 (required dependency)
-uv pip install install-k2
-uv pip install pip
-uv run install-k2 --torch-version 2.9.1
-
 # Install LattifAI
 uv pip install lattifai
 ```
 
-> **Note**: `install-k2` automatically detects your PyTorch version (up to 2.9) and installs the compatible k2 wheel.
-
-<details>
-<summary><b>install-k2 options</b></summary>
 
-```
-usage: install-k2 [-h] [--system {linux,darwin,windows}] [--dry-run] [--torch-version TORCH_VERSION]
-
-optional arguments:
--h, help Show this help message and exit
---system {linux,darwin,windows} Override OS detection
---dry-run Show what would be installed without making changes
---torch-version TORCH_VERSION Specify torch version (e.g., 2.8.0)
-```
-</details>
 
 ### Step 2: Get Your API Key
 
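The `install-k2` bootstrap steps removed above correspond to the new `k2py>=0.2.1` requirement added in the METADATA hunk earlier, so a post-install sanity check could look like this sketch (standard library only; the distribution names are taken from the requirement lines, nothing else is assumed):

```python
from importlib.metadata import version

# Both distributions should resolve once `pip install lattifai` has run.
for dist in ("lattifai", "k2py"):
    print(dist, version(dist))
```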
@@ -252,7 +263,7 @@ caption = client.alignment(
 
 That's it! Your aligned subtitles are saved to `aligned.srt`.
 
-### Web Interface
+### 🚧 Web Interface
 
 
 
@@ -310,13 +321,9 @@ That's it! Your aligned subtitles are saved to `aligned.srt`.
 The web interface will automatically open in your browser at `http://localhost:5173`.
 
 **Features:**
-- ✅
-- ✅
-- ✅
-- ✅ Multiple subtitle format support
-- ✅ Built-in transcription with multiple models
-- ✅ API key management interface
-- ✅ Download aligned subtitles in various formats
+- ✅ **Drag-and-Drop Upload**: Visual file upload for audio/video and captions
+- ✅ **Real-Time Progress**: Live alignment progress with detailed status
+- ✅ **Multiple Transcription Models**: Gemini, Parakeet, SenseVoice selection
 
 ---
 
@@ -617,6 +624,78 @@ from lattifai import (
 
 ## Advanced Features
 
+### Audio Preprocessing
+
+LattifAI provides powerful audio preprocessing capabilities for optimal alignment:
+
+**Channel Selection**
+
+Control which audio channel to process for stereo/multi-channel files:
+
+```python
+from lattifai import LattifAI
+
+client = LattifAI()
+
+# Use left channel only
+caption = client.alignment(
+    input_media="stereo.wav",
+    input_caption="subtitle.srt",
+    channel_selector="left",  # Options: "left", "right", "average", or channel index (0, 1, 2, ...)
+)
+
+# Average all channels (default)
+caption = client.alignment(
+    input_media="stereo.wav",
+    input_caption="subtitle.srt",
+    channel_selector="average",
+)
+```
+
+**CLI Usage:**
+```bash
+# Use right channel
+lai alignment align audio.wav subtitle.srt output.srt \
+    media.channel_selector=right
+
+# Use specific channel index
+lai alignment align audio.wav subtitle.srt output.srt \
+    media.channel_selector=1
+```
+
+**Device Management**
+
+Optimize processing for your hardware:
+
+```python
+from lattifai import LattifAI, AlignmentConfig
+
+# Use CUDA GPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cuda")
+)
+
+# Use specific GPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cuda:0")
+)
+
+# Use Apple Silicon MPS
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="mps")
+)
+
+# Use CPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cpu")
+)
+```
+
+**Supported Formats**
+- **Audio**: WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more
+- **Video**: MP4, MKV, MOV, WEBM, AVI, and more
+- All formats supported by FFmpeg are compatible
+
 ### Long-Form Audio Support
 
 LattifAI now supports processing long audio files (up to 20 hours) through streaming mode. Enable streaming by setting the `streaming_chunk_secs` parameter:
@@ -658,14 +737,18 @@ client = LattifAI(
 )
 ```
 
-**
-
-
-
-
-
-
-
+**Technical Details:**
+
+| Parameter | Description | Recommendation |
+|-----------|-------------|----------------|
+| **Default Value** | 600 seconds (10 minutes) | Good for most use cases |
+| **Memory Impact** | Smaller chunks = less RAM usage | Adjust based on available RAM |
+| **Accuracy Impact** | Virtually zero degradation | Our precise implementation preserves quality |
+
+**Performance Characteristics:**
+- ✅ **Near-Perfect Accuracy**: Streaming implementation maintains alignment precision
+- 🚧 **Memory Efficient**: Process 20-hour audio with <10GB RAM (600-sec chunks)
+
 
 ### Word-Level Alignment
 
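For a rough sense of what the 600-second default in the table above means for a long recording, a back-of-envelope sketch (illustration only; it assumes chunks are processed one at a time, which is what keeps peak RAM low):

```python
# Rough chunk count for the 20-hour example mentioned above.
duration_hours = 20
chunk_secs = 600.0  # documented default (10 minutes)

num_chunks = int(duration_hours * 3600 / chunk_secs)
print(num_chunks)  # 120 sequential chunks
```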
@@ -708,14 +791,30 @@ caption = client.alignment(
 
 ### Speaker Diarization
 
-Speaker diarization automatically identifies and labels different speakers in audio
-
-
--
+Speaker diarization automatically identifies and labels different speakers in audio using state-of-the-art models.
+
+**Core Capabilities:**
+- 🎤 **Multi-Speaker Detection**: Automatically detect speaker changes in audio
+- 🏷️ **Smart Labeling**: Assign speaker labels (SPEAKER_00, SPEAKER_01, etc.)
+- 🔄 **Label Preservation**: Maintain existing speaker names from input captions
+- 🤖 **Gemini Integration**: Extract speaker names intelligently during transcription
+
+**How It Works:**
+
+1. **Without Existing Labels**: System assigns generic labels (SPEAKER_00, SPEAKER_01)
+2. **With Existing Labels**: System preserves your speaker names during alignment
+   - Formats: `[Alice]`, `>> Bob:`, `SPEAKER_01:`, `Alice:` are all recognized
+3. **Gemini Transcription**: When using Gemini models, speaker names are extracted from context
+   - Example: "Hi, I'm Alice" → System labels as `Alice` instead of `SPEAKER_00`
 
-**Speaker
-
-
+**Speaker Label Integration:**
+
+The diarization engine intelligently matches detected speakers with existing labels:
+- If input captions have speaker names → **Preserved during alignment**
+- If Gemini transcription provides names → **Used for labeling**
+- Otherwise → **Generic labels (SPEAKER_00, etc.) assigned**
+* 🚧 **Future Enhancement:**
+  - **AI-Powered Speaker Name Inference**: Upcoming feature will use large language models combined with metadata (video title, description, context) to intelligently infer speaker names, making transcripts more human-readable and contextually accurate
 
 **CLI:**
 ```bash
@@ -756,6 +855,8 @@ for segment in caption.supervisions:
 
 ### YAML Configuration Files
 
+* **under development**
+
 Create reusable configuration files:
 
 ```yaml
@@ -772,6 +873,125 @@ lai alignment align audio.wav subtitle.srt output.srt \
 
 ---
 
+## Architecture Overview
+
+LattifAI uses a modular, config-driven architecture for maximum flexibility:
+
+```
+┌───────────────────────────────────────────────────────┐
+│                    LattifAI Client                     │
+├───────────────────────────────────────────────────────┤
+│  Configuration Layer (Config-Driven)                   │
+│  ├── ClientConfig        (API settings)                │
+│  ├── AlignmentConfig     (Model & device)              │
+│  ├── CaptionConfig       (I/O formats)                 │
+│  ├── TranscriptionConfig (ASR models)                  │
+│  └── DiarizationConfig   (Speaker detection)           │
+├───────────────────────────────────────────────────────┤
+│  Core Components                                       │
+│  ├── AudioLoader → Load & preprocess audio             │
+│  ├── Aligner     → Lattice-1 forced alignment          │
+│  ├── Transcriber → Multi-model ASR                     │
+│  ├── Diarizer    → Speaker identification              │
+│  └── Tokenizer   → Intelligent text segmentation       │
+├───────────────────────────────────────────────────────┤
+│  Data Flow                                             │
+│  Input → AudioLoader → Aligner → Diarizer → Caption    │
+│                            ↓                           │
+│                    Transcriber (optional)              │
+└───────────────────────────────────────────────────────┘
+```
+
+**Component Responsibilities:**
+
+| Component | Purpose | Configuration |
+|-----------|---------|---------------|
+| **AudioLoader** | Load audio/video, channel selection, format conversion | `MediaConfig` |
+| **Aligner** | Forced alignment using Lattice-1 model | `AlignmentConfig` |
+| **Transcriber** | ASR with Gemini/Parakeet/SenseVoice | `TranscriptionConfig` |
+| **Diarizer** | Speaker diarization with pyannote.audio | `DiarizationConfig` |
+| **Tokenizer** | Sentence splitting and text normalization | `CaptionConfig` |
+| **Caption** | Unified data structure for alignments | `CaptionConfig` |
+
+**Data Flow:**
+
+1. **Audio Loading**: `AudioLoader` loads media, applies channel selection, converts to numpy array
+2. **Transcription** (optional): `Transcriber` generates transcript if no caption provided
+3. **Text Preprocessing**: `Tokenizer` splits sentences and normalizes text
+4. **Alignment**: `Aligner` uses Lattice-1 to compute word-level timestamps
+5. **Diarization** (optional): `Diarizer` identifies speakers and assigns labels
+6. **Output**: `Caption` object contains all results, exported to desired format
+
+**Configuration Philosophy:**
+- ✅ **Declarative**: Describe what you want, not how to do it
+- ✅ **Composable**: Mix and match configurations
+- ✅ **Reproducible**: Save configs to YAML for consistent results
+- ✅ **Flexible**: Override configs per-method or globally
+
+---
+
+## Performance & Optimization
+
+### Device Selection
+
+Choose the optimal device for your hardware:
+
+```python
+from lattifai import LattifAI, AlignmentConfig
+
+# NVIDIA GPU (recommended for speed)
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cuda")
+)
+
+# Apple Silicon GPU
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="mps")
+)
+
+# CPU (maximum compatibility)
+client = LattifAI(
+    alignment_config=AlignmentConfig(device="cpu")
+)
+```
+
+**Performance Comparison** (30-minute audio):
+
+| Device | Time |
+|--------|------|
+| CUDA (RTX 4090) | ~18 sec |
+| MPS (M4) | ~26 sec |
+
+### Memory Management
+
+**Streaming Mode** for long audio:
+
+```python
+# Process 20-hour audio with <10GB RAM
+caption = client.alignment(
+    input_media="long_audio.wav",
+    input_caption="subtitle.srt",
+    streaming_chunk_secs=600.0,  # 10-minute chunks
+)
+```
+
+**Memory Usage** (approximate):
+
+| Chunk Size | Peak RAM | Suitable For |
+|------------|----------|--------------|
+| 600 sec | ~5 GB | Recommended |
+| No streaming | ~10 GB+ | Short audio only |
+
+### Optimization Tips
+
+1. **Use GPU when available**: 10x faster than CPU
+2. **WIP: Enable streaming for long audio**: Process 20+ hour files without OOM
+3. **Choose appropriate chunk size**: Balance memory vs. performance
+4. **Batch processing**: Process multiple files in sequence (coming soon)
+5. **Profile alignment**: Set `client.profile=True` to identify bottlenecks
+
+---
+
 ## Supported Formats
 
 LattifAI supports virtually all common media and subtitle formats:
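To make the component/config mapping and data flow above concrete, here is a minimal sketch that composes two of the configuration objects from the table. Passing `alignment_config` and `transcription_config` together is an assumption based on the separate examples shown elsewhere in this README, not a verbatim excerpt:

```python
from lattifai import LattifAI, AlignmentConfig, TranscriptionConfig

# Configuration layer: declare what each component should do.
client = LattifAI(
    alignment_config=AlignmentConfig(device="cuda"),  # Aligner: Lattice-1 on GPU
    transcription_config=TranscriptionConfig(
        model_name="nvidia/parakeet-tdt-0.6b-v3",     # Transcriber: Parakeet ASR
        language="de",
    ),
)

# Core components then run the documented flow:
# Input → AudioLoader → Aligner → Diarizer → Caption
caption = client.alignment(
    input_media="talk.wav",   # placeholder input
    input_caption="talk.srt",
)
```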
@@ -792,14 +1012,77 @@ LattifAI supports virtually all common media and subtitle formats:
 
 ---
 
+## Supported Languages
+
+LattifAI supports multiple transcription models with different language capabilities:
+
+### Gemini Models (100+ Languages)
+
+**Models**: `gemini-2.5-pro`, `gemini-3-pro-preview`, `gemini-3-flash-preview`
+
+**Supported Languages**: English, Chinese (Mandarin & Cantonese), Spanish, French, German, Italian, Portuguese, Japanese, Korean, Arabic, Russian, Hindi, Bengali, Turkish, Dutch, Polish, Swedish, Danish, Norwegian, Finnish, Greek, Hebrew, Thai, Vietnamese, Indonesian, Malay, Filipino, Ukrainian, Czech, Romanian, Hungarian, Swahili, Tamil, Telugu, Marathi, Gujarati, Kannada, and 70+ more languages.
+
+> **Note**: Requires Gemini API key from [Google AI Studio](https://aistudio.google.com/apikey)
+
+### NVIDIA Parakeet (24 European Languages)
+
+**Model**: `nvidia/parakeet-tdt-0.6b-v3`
+
+**Supported Languages**:
+- **Western Europe**: English (en), French (fr), German (de), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl)
+- **Nordic**: Danish (da), Swedish (sv), Norwegian (no), Finnish (fi)
+- **Eastern Europe**: Polish (pl), Czech (cs), Slovak (sk), Hungarian (hu), Romanian (ro), Bulgarian (bg), Ukrainian (uk), Russian (ru)
+- **Others**: Croatian (hr), Estonian (et), Latvian (lv), Lithuanian (lt), Slovenian (sl), Maltese (mt), Greek (el)
+
+### Alibaba SenseVoice (5 Asian Languages)
+
+**Model**: `iic/SenseVoiceSmall`
+
+**Supported Languages**:
+- Chinese/Mandarin (zh)
+- English (en)
+- Japanese (ja)
+- Korean (ko)
+- Cantonese (yue)
+
+### Language Selection
+
+```python
+from lattifai import LattifAI, TranscriptionConfig
+
+# Specify language for transcription
+client = LattifAI(
+    transcription_config=TranscriptionConfig(
+        model_name="nvidia/parakeet-tdt-0.6b-v3",
+        language="de",  # German
+    )
+)
+```
+
+**CLI Usage:**
+```bash
+lai transcribe run audio.wav output.srt \
+    transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
+    transcription.language=de
+```
+
+> **Tip**: Use Gemini models for maximum language coverage, Parakeet for European languages, and SenseVoice for Asian languages.
+
+---
+
 ## Roadmap
 
 Visit our [LattifAI roadmap](https://lattifai.com/roadmap) for the latest updates.
 
-| Date | Release | Features |
+| Date | Model Release | Features |
 |------|---------|----------|
 | **Oct 2025** | **Lattice-1-Alpha** | ✅ English forced alignment<br>✅ Multi-format support<br>✅ CPU/GPU optimization |
-| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br
+| **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br>✅ Speaker Diarization<br>✅ Multi-model transcription (Gemini, Parakeet, SenseVoice)<br>✅ Web interface with React<br>🚧 Advanced segmentation strategies (entire/transcription/hybrid)<br>🚧 Audio event detection ([MUSIC], [APPLAUSE], etc.)<br> |
+| **Q1 2026** | **Lattice-2** | ✅ Streaming mode for long audio<br>🔮 40+ languages support<br>🔮 Real-time alignment |
+
+**Legend**: ✅ Released | 🚧 In Development | 📋 Planned | 🔮 Future
 
 ---
 
{lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/RECORD

@@ -1,20 +1,21 @@
-lattifai/__init__.py,sha256=
-lattifai/audio2.py,sha256=
-lattifai/client.py,sha256=
+lattifai/__init__.py,sha256=RXa1IK8Qt6jsAnLlxecOCZmREqv2naXx6T1Fy0g6pqU,1953
+lattifai/audio2.py,sha256=P3N8_BwiscbetzDbkbj-n8BcMu2vWD6-MvtQvGwWWf0,17448
+lattifai/client.py,sha256=7I3tUtW8fkhUY1G7vjIuYPdqYGcgw6BbCIjjBarhlyM,21318
 lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
 lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
-lattifai/mixin.py,sha256=
+lattifai/mixin.py,sha256=PRBRkEGmlWSpLx_qyN0uWxPoJ0MT9Fr_unFkBSjglaU,25516
 lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
-lattifai/utils.py,sha256=
+lattifai/utils.py,sha256=ZvgJIM4N11BfD8wEyjoz4K_XzcXPxSRoXoU15oy1-vg,8192
 lattifai/alignment/__init__.py,sha256=ehpkKfjNIYUx7_M-RWD_8Efcrzd9bE-NSm0QgMMVLW0,178
-lattifai/alignment/lattice1_aligner.py,sha256=
-lattifai/alignment/lattice1_worker.py,sha256=
+lattifai/alignment/lattice1_aligner.py,sha256=098liE2Tvb01X5rz6iZWtokSOvfnCydjQiEkKdZeMc8,6245
+lattifai/alignment/lattice1_worker.py,sha256=hQbZTgncPq3n-b_l-gUPDPfm460EwuZTKveErgWLWNk,10891
 lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
-lattifai/alignment/segmenter.py,sha256=
-lattifai/alignment/
+lattifai/alignment/segmenter.py,sha256=0s0eABe0rLAo7eNfl0l5e_knxmZba_BjabPdqsRD45E,6284
+lattifai/alignment/sentence_splitter.py,sha256=lwT9ZrvcuM0c9lzLCydHEbAw-TO4Z5u6zZPen-yUPUg,9090
+lattifai/alignment/tokenizer.py,sha256=rewOcpSv6UxgC3VmuCzMyUIlHVZhJB-FbXKKk7DGNMI,15673
 lattifai/caption/__init__.py,sha256=6MM_2j6CaqwZ81LfSy4di2EP0ykvheRjMZKAYDx2rQs,477
-lattifai/caption/caption.py,sha256=
-lattifai/caption/gemini_reader.py,sha256=
+lattifai/caption/caption.py,sha256=LB6JdKovadrLOudKeQihloLik6xMYg_nj2a8g6Dg7GY,54593
+lattifai/caption/gemini_reader.py,sha256=jD18RqOrFWYA6b2-5yZQcZEy39hu1OU7gb9i43oo0rc,19930
 lattifai/caption/gemini_writer.py,sha256=sYPxYEmVQcEan5WVGgSrcraxs3QJRQRh8CJkl2yUQ1s,6515
 lattifai/caption/supervision.py,sha256=DRrM8lfKU_x9aVBcLG6xnT0xIJrnc8jzHpzcSwQOg8c,905
 lattifai/caption/text_parser.py,sha256=XDb8KTt031uJ1hg6dpbINglGOTX-6pBcghbg3DULM1I,4633
@@ -22,14 +23,14 @@ lattifai/cli/__init__.py,sha256=LafsAf8YfDcfTeJ1IevFcyLm-mNbxpOOnm33OFKtpDM,523
 lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
 lattifai/cli/app_installer.py,sha256=gAndH3Yo97fGRDe2CQnGtOgZZ4k3_v5ftcUo5g6xbSA,5884
 lattifai/cli/caption.py,sha256=4qQ9DFhxcfaeFMY0TB5I42x4W_gOo2zY6kjXnHnFDms,6313
-lattifai/cli/diarization.py,sha256=
+lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
 lattifai/cli/server.py,sha256=sXMfOSse9-V79slXUU8FDLeqtI5U9zeU-5YpjTIGyVw,1186
-lattifai/cli/transcribe.py,sha256=
-lattifai/cli/youtube.py,sha256
+lattifai/cli/transcribe.py,sha256=1bKnFOxyO8KHbmtdrJC8ZEjBAnbuWhtejILOp9PkptQ,8047
+lattifai/cli/youtube.py,sha256=9M2dpcUCvT7vVbXJCIxJwe9klJXoF2jUeLxiatslYso,6063
 lattifai/config/__init__.py,sha256=Z8OudvS6fgfLNLu_2fvoXartQiYCECOnNfzDt-PfCN4,543
-lattifai/config/alignment.py,sha256=
+lattifai/config/alignment.py,sha256=dB-sX0ZnsCy7O2cX9NnU5UQ5aFaPANaCozESKv_k_vY,4620
 lattifai/config/caption.py,sha256=AYOyUJ1xZsX8CBZy3GpLitbcCAHcZ9LwXui_v3vtuso,6786
-lattifai/config/client.py,sha256=
+lattifai/config/client.py,sha256=46b816MiYja3Uan_3wjnhtqDr0M6T-FqEygJ3e50IZc,1664
 lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
 lattifai/config/media.py,sha256=cjM8eGeZ7ELhmy4cCqHAyogeHItaVqMrPzSwwIx79HY,14856
 lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
@@ -48,10 +49,10 @@ lattifai/workflow/__init__.py,sha256=GOT9jptXwpIMiNRqJ_LToEt_5Dt0k7XXbLkFzhrl31o
 lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
 lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
 lattifai/workflow/file_manager.py,sha256=IUWW838ta83kfwM4gpW83gsD_Tx-pa-L_RWKjiefQbQ,33017
-lattifai/workflow/youtube.py,sha256=
-lattifai-1.1.
-lattifai-1.1.
-lattifai-1.1.
-lattifai-1.1.
-lattifai-1.1.
-lattifai-1.1.
+lattifai/workflow/youtube.py,sha256=0B1l_8gdz_O0cy2c9AY9wRPizESQrpRuCP4rwvWRxLA,23687
+lattifai-1.2.1.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
+lattifai-1.2.1.dist-info/METADATA,sha256=qw4slozNhu8oHN4Hku7XP_wJ9IuKZsNCqTyGbm3E9oM,37437
+lattifai-1.2.1.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+lattifai-1.2.1.dist-info/entry_points.txt,sha256=nHZri2VQkPYEl0tQ0dMYTpVGlCOgVWlDG_JtDR3QXF8,545
+lattifai-1.2.1.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+lattifai-1.2.1.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes