lattifai-1.2.1-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lattifai/__init__.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, List, Optional
3
+
4
+
5
+ @dataclass
+ class VideoMetadata:
+     """Metadata describing a single YouTube video.
+
+     Plain data holder returned by the YouTube client; all values come
+     from the video's public listing.
+     """
+
+     video_id: str  # YouTube video identifier
+     title: str
+     description: str
+     duration: float  # seconds
+     thumbnail_url: str
+     channel_name: str
+     view_count: int
+     # Upload date when available; exact string format is not shown here —
+     # presumably yt-dlp's YYYYMMDD style; confirm against the client code.
+     upload_date: Optional[str] = None
15
+
16
+
17
+ @dataclass
+ class CaptionTrack:
+     """One caption/subtitle track available for a video."""
+
+     language_code: str  # BCP-47-style code, e.g. 'en' — TODO confirm exact convention
+     language_name: str  # human-readable language name
+     kind: str  # 'manual' | 'asr'
+     ext: str  # 'vtt', 'srv3' etc
+     url: Optional[str] = None  # direct download URL when resolved, else None
@@ -0,0 +1,678 @@
1
+ Metadata-Version: 2.4
2
+ Name: lattifai
3
+ Version: 1.3.0
4
+ Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
5
+ Author-email: Lattifai Technologies <tech@lattifai.com>
6
+ Maintainer-email: Lattice <tech@lattifai.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2025 LattifAI.
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: Homepage, https://github.com/lattifai/lattifai-python
30
+ Project-URL: Documentation, https://github.com/lattifai/lattifai-python/blob/main/README.md
31
+ Project-URL: Bug Tracker, https://github.com/lattifai/lattifai-python/issues
32
+ Project-URL: Discussions, https://github.com/lattifai/lattifai-python/discussions
33
+ Project-URL: Changelog, https://github.com/lattifai/lattifai-python/blob/main/CHANGELOG.md
34
+ Keywords: lattifai,speech recognition,video analysis,ai,sdk,api client
35
+ Classifier: Development Status :: 5 - Production/Stable
36
+ Classifier: Intended Audience :: Developers
37
+ Classifier: Intended Audience :: Science/Research
38
+ Classifier: License :: OSI Approved :: MIT License
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Programming Language :: Python :: 3.13
43
+ Classifier: Programming Language :: Python :: 3.14
44
+ Classifier: Operating System :: MacOS :: MacOS X
45
+ Classifier: Operating System :: POSIX :: Linux
46
+ Classifier: Operating System :: Microsoft :: Windows
47
+ Classifier: Topic :: Multimedia :: Sound/Audio
48
+ Classifier: Topic :: Multimedia :: Video
49
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
50
+ Requires-Python: <3.15,>=3.10
51
+ Description-Content-Type: text/markdown
52
+ License-File: LICENSE
53
+ Requires-Dist: python-dotenv
54
+ Requires-Dist: colorful>=0.5.6
55
+ Requires-Dist: lattifai-run>=1.0.1
56
+ Requires-Dist: lattifai-captions[splitting]>=0.1.6
57
+ Requires-Dist: lattifai-core-hq>=0.6.4
58
+ Requires-Dist: g2p-phonemizer>=0.4.0
59
+ Requires-Dist: error-align-fix>=0.1.4
60
+ Requires-Dist: lhotse>=1.26.0
61
+ Requires-Dist: k2py==0.2.4
62
+ Requires-Dist: onnxruntime
63
+ Requires-Dist: av
64
+ Requires-Dist: msgpack
65
+ Provides-Extra: event
66
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "event"
67
+ Provides-Extra: diarization
68
+ Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "diarization"
69
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "diarization"
70
+ Provides-Extra: transcription
71
+ Requires-Dist: OmniSenseVoice>=0.4.2; extra == "transcription"
72
+ Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "transcription"
73
+ Requires-Dist: google-genai>=1.22.0; extra == "transcription"
74
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "transcription"
75
+ Provides-Extra: youtube
76
+ Requires-Dist: questionary>=2.0; extra == "youtube"
77
+ Requires-Dist: yt-dlp; extra == "youtube"
78
+ Requires-Dist: pycryptodome; extra == "youtube"
79
+ Provides-Extra: dev
80
+ Requires-Dist: black; extra == "dev"
81
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
82
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
83
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
84
+ Provides-Extra: all
85
+ Requires-Dist: lattifai[transcription]; extra == "all"
86
+ Requires-Dist: lattifai[youtube]; extra == "all"
87
+ Dynamic: license-file
88
+
89
+ <div align="center">
90
+ <img src="https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/logo.png" width=256>
91
+
92
+ [![PyPI version](https://badge.fury.io/py/lattifai.svg)](https://badge.fury.io/py/lattifai)
93
+ [![Python Versions](https://img.shields.io/pypi/pyversions/lattifai.svg)](https://pypi.org/project/lattifai)
94
+ [![PyPI Status](https://pepy.tech/badge/lattifai)](https://pepy.tech/project/lattifai)
95
+ </div>
96
+
97
+ <p align="center">
98
+ 🌐 <a href="https://lattifai.com"><b>Official Website</b></a> &nbsp;&nbsp; | &nbsp;&nbsp; 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> &nbsp;&nbsp; | &nbsp;&nbsp; 🤗 <a href="https://huggingface.co/LattifAI/Lattice-1">Model</a> &nbsp;&nbsp; | &nbsp;&nbsp; 📑 <a href="https://lattifai.com/blogs">Blog</a> &nbsp;&nbsp; | &nbsp;&nbsp; <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
99
+ </p>
100
+
101
+
102
+ # LattifAI: Precision Alignment, Infinite Possibilities
103
+
104
+ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/LattifAI/Lattice-1) model.
105
+
106
+ ## Table of Contents
107
+
108
+ - [Features](#features)
109
+ - [Installation](#installation)
110
+ - [Quick Start](#quick-start)
111
+ - [CLI Reference](#cli-reference)
112
+ - [Python SDK](#python-sdk)
113
+ - [Advanced Features](#advanced-features)
114
+ - [Text Processing](#text-processing)
115
+ - [Supported Formats & Languages](#supported-formats--languages)
116
+ - [Roadmap](#roadmap)
117
+ - [Development](#development)
118
+
119
+ ---
120
+
121
+ ## Features
122
+
123
+ | Feature | Description |
124
+ |---------|-------------|
125
+ | **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/LattifAI/Lattice-1) |
126
+ | **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) |
127
+ | **Speaker Diarization** | Multi-speaker identification with label preservation |
128
+ | **Streaming Mode** | Process audio up to 20 hours with minimal memory |
129
+ | **Universal Format Support** | 30+ caption/subtitle formats |
130
+
131
+ ### Alignment Models
132
+
133
+ | Model | Links | Languages | Description |
134
+ |-------|-------|-----------|-------------|
135
+ | **Lattice-1** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1) | English, Chinese, German | Production model with mixed-language alignment support |
136
+ | **Lattice-1-Alpha** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1-Alpha) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1-Alpha) | English | Initial release with English forced alignment |
137
+
138
+ **Model Hub**: Models can be downloaded from `huggingface` (default) or `modelscope` (recommended for users in China):
139
+
140
+ ```bash
141
+ # Use ModelScope (faster in China)
142
+ lai alignment align audio.wav caption.srt output.srt alignment.model_hub=modelscope
143
+ ```
144
+
145
+ ```python
146
+ from lattifai.client import LattifAI
147
+ from lattifai.config import AlignmentConfig
148
+
149
+ client = LattifAI(alignment_config=AlignmentConfig(model_hub="modelscope"))
150
+ ```
151
+
152
+ ---
153
+
154
+ ## Installation
155
+
156
+ ### Using uv (Recommended)
157
+
158
+ [uv](https://github.com/astral-sh/uv) is a fast Python package manager (10-100x faster than pip).
159
+
160
+ ```bash
161
+ # Install uv
162
+ curl -LsSf https://astral.sh/uv/install.sh | sh
163
+
164
+ # Quick start (run without installing)
165
+ uvx --from lattifai lai --help
166
+
167
+ # Or create a project
168
+ mkdir my-project && cd my-project
169
+ uv init --bare && uv add "lattifai[all]"
170
+ uv run lai alignment align audio.wav caption.srt output.srt
171
+ ```
172
+
173
+ ### Using pip
174
+
175
+ ```bash
176
+ # Full installation (recommended)
177
+ pip install "lattifai[all]"
178
+ ```
179
+
180
+ ### Installation Options
181
+
182
+ | Extra | Command | Includes |
183
+ |-------|---------|----------|
184
+ | (base) | `pip install lattifai` | Forced alignment (Lattice-1, k2py, ONNX, captions) |
185
+ | `all` | `pip install "lattifai[all]"` | Base + transcription + youtube |
186
+ | `transcription` | `pip install "lattifai[transcription]"` | ASR models (Gemini, Parakeet, SenseVoice) |
187
+ | `youtube` | `pip install "lattifai[youtube]"` | YouTube download (yt-dlp) |
188
+ | `diarization` | `pip install "lattifai[diarization]"` | Speaker diarization (NeMo, pyannote) |
189
+ | `event` | `pip install "lattifai[event]"` | Audio event detection |
190
+
191
+ **Note:** Base installation includes full alignment functionality. Use `[all]` for transcription and YouTube features.
192
+
193
+ ### Caption Format Support
194
+
195
+ Caption/subtitle format parsing is provided by [lattifai-captions](https://github.com/lattifai/captions), a separate package supporting 30+ formats (SRT, VTT, ASS, TTML, TextGrid, NLE formats, etc.). It is automatically installed with the base `lattifai` package (and therefore also with `lattifai[all]`).
196
+
197
+ ### API Keys
198
+
199
+ **LattifAI API Key (Required)** - Get your free key at [lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
200
+
201
+ ```bash
202
+ export LATTIFAI_API_KEY="lf_your_api_key_here"
203
+ ```
204
+
205
+ **Gemini API Key (Optional)** - For transcription with Gemini models, get key at [aistudio.google.com/apikey](https://aistudio.google.com/apikey)
206
+
207
+ ```bash
208
+ export GEMINI_API_KEY="your_gemini_api_key_here"
209
+ ```
210
+
211
+ Or use a `.env` file:
212
+ ```bash
213
+ LATTIFAI_API_KEY=lf_your_api_key_here
214
+ GEMINI_API_KEY=your_gemini_api_key_here
215
+ ```
216
+
217
+ ---
218
+
219
+ ## Quick Start
220
+
221
+ ### Command Line
222
+
223
+ ```bash
224
+ # Align audio with subtitle
225
+ lai alignment align audio.wav subtitle.srt output.srt
226
+
227
+ # YouTube video
228
+ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
229
+ ```
230
+
231
+ ### Python SDK
232
+
233
+ ```python
234
+ from lattifai.client import LattifAI
235
+
236
+ client = LattifAI()
237
+ caption = client.alignment(
238
+ input_media="audio.wav",
239
+ input_caption="subtitle.srt",
240
+ output_caption_path="aligned.srt",
241
+ )
242
+ ```
243
+
244
+ ---
245
+
246
+ ## CLI Reference
247
+
248
+ | Command | Description | Example |
249
+ |---------|-------------|---------|
250
+ | `lai alignment align` | Align audio/video with caption | `lai alignment align audio.wav caption.srt output.srt` |
251
+ | `lai alignment youtube` | Download & align YouTube | `lai alignment youtube "https://youtube.com/watch?v=ID"` |
252
+ | `lai transcribe run` | Transcribe audio/video | `lai transcribe run audio.wav output.srt` |
253
+ | `lai transcribe align` | Transcribe and align | `lai transcribe align audio.wav output.srt` |
254
+ | `lai caption convert` | Convert caption formats | `lai caption convert input.srt output.vtt` |
255
+ | `lai caption shift` | Shift timestamps | `lai caption shift input.srt output.srt 2.0` |
256
+
257
+ ### Common Options
258
+
259
+ ```bash
260
+ # Device selection
261
+ alignment.device=cuda # cuda, mps, cpu
262
+
263
+ # Caption options
264
+ caption.split_sentence=true # Smart sentence splitting
265
+ caption.word_level=true # Word-level timestamps
266
+
267
+ # Streaming for long audio
268
+ media.streaming_chunk_secs=600
269
+
270
+ # Channel selection
271
+ media.channel_selector=left # left, right, average, or index
272
+ ```
273
+
274
+ ### Transcription Models
275
+
276
+ ```bash
277
+ # Gemini (100+ languages, requires GEMINI_API_KEY)
278
+ transcription.model_name=gemini-2.5-pro
279
+
280
+ # Parakeet (24 European languages)
281
+ transcription.model_name=nvidia/parakeet-tdt-0.6b-v3
282
+
283
+ # SenseVoice (zh, en, ja, ko, yue)
284
+ transcription.model_name=iic/SenseVoiceSmall
285
+ ```
286
+
287
+ ### lai transcribe run
288
+
289
+ Transcribe audio/video files or YouTube URLs to generate timestamped captions.
290
+
291
+ ```bash
292
+ # Local file
293
+ lai transcribe run audio.wav output.srt
294
+
295
+ # YouTube URL
296
+ lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output_dir=./output
297
+
298
+ # With model selection
299
+ lai transcribe run audio.wav output.srt \
300
+ transcription.model_name=gemini-2.5-pro \
301
+ transcription.device=cuda
302
+ ```
303
+
304
+ **Parameters:**
305
+ - `input`: Path to audio/video file or YouTube URL
306
+ - `output_caption`: Output caption file path (for local files)
307
+ - `output_dir`: Output directory (for YouTube URLs, defaults to current directory)
308
+ - `channel_selector`: Audio channel - `average` (default), `left`, `right`, or channel index
309
+
310
+ ### lai transcribe align
311
+
312
+ Transcribe and align in a single step - produces precisely aligned captions.
313
+
314
+ ```bash
315
+ # Basic usage
316
+ lai transcribe align audio.wav output.srt
317
+
318
+ # With options
319
+ lai transcribe align audio.wav output.srt \
320
+ transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
321
+ alignment.device=cuda \
322
+ caption.split_sentence=true \
323
+ caption.word_level=true
324
+ ```
325
+
326
+ ---
327
+
328
+ ## Python SDK
329
+
330
+ ### Configuration Objects
331
+
332
+ ```python
333
+ from lattifai.client import LattifAI
334
+ from lattifai.config import (
335
+ ClientConfig,
336
+ AlignmentConfig,
337
+ CaptionConfig,
338
+ DiarizationConfig,
339
+ MediaConfig,
340
+ )
341
+
342
+ client = LattifAI(
343
+ client_config=ClientConfig(api_key="lf_xxx", timeout=60.0),
344
+ alignment_config=AlignmentConfig(device="cuda"),
345
+ caption_config=CaptionConfig(split_sentence=True, word_level=True),
346
+ )
347
+
348
+ caption = client.alignment(
349
+ input_media="audio.wav",
350
+ input_caption="subtitle.srt",
351
+ output_caption_path="output.json",
352
+ )
353
+
354
+ # Access results
355
+ for segment in caption.supervisions:
356
+ print(f"{segment.start:.2f}s - {segment.end:.2f}s: {segment.text}")
357
+ ```
358
+
359
+ ### YouTube Processing
360
+
361
+ ```python
362
+ caption = client.youtube(
363
+ url="https://youtube.com/watch?v=VIDEO_ID",
364
+ output_dir="./downloads",
365
+ output_caption_path="aligned.srt",
366
+ )
367
+ ```
368
+
369
+ ### CaptionConfig Options
370
+
371
+ | Option | Default | Description |
372
+ |--------|---------|-------------|
373
+ | `split_sentence` | `False` | Smart sentence splitting, separates non-speech elements |
374
+ | `word_level` | `False` | Include word-level timestamps in output |
375
+ | `normalize_text` | `True` | Clean HTML entities and special characters |
376
+ | `include_speaker_in_text` | `True` | Include speaker labels in text output |
377
+
378
+ ```python
379
+ from lattifai.client import LattifAI
380
+ from lattifai.config import CaptionConfig
381
+
382
+ client = LattifAI(
383
+ caption_config=CaptionConfig(
384
+ split_sentence=True,
385
+ word_level=True,
386
+ normalize_text=True,
387
+ include_speaker_in_text=False,
388
+ )
389
+ )
390
+ ```
391
+
392
+ ---
393
+
394
+ ## Advanced Features
395
+
396
+ ### Streaming Mode (Long Audio)
397
+
398
+ Process audio up to 20 hours with minimal memory:
399
+
400
+ ```python
401
+ caption = client.alignment(
402
+ input_media="long_audio.wav",
403
+ input_caption="subtitle.srt",
404
+ streaming_chunk_secs=600.0, # 10-minute chunks
405
+ )
406
+ ```
407
+
408
+ ### Word-Level Alignment
409
+
410
+ ```python
411
+ from lattifai.client import LattifAI
412
+ from lattifai.config import CaptionConfig
413
+
414
+ client = LattifAI(caption_config=CaptionConfig(word_level=True))
415
+ caption = client.alignment(
416
+ input_media="audio.wav",
417
+ input_caption="subtitle.srt",
418
+ output_caption_path="output.json", # JSON preserves word-level data
419
+ )
420
+ ```
421
+
422
+ ### Speaker Diarization
423
+
424
+ Automatically identify and label different speakers in audio.
425
+
426
+ **Capabilities:**
427
+ - **Multi-Speaker Detection**: Automatically detect speaker changes
428
+ - **Smart Labeling**: Assign labels (SPEAKER_00, SPEAKER_01, etc.)
429
+ - **Label Preservation**: Maintain existing speaker names from input captions
430
+ - **Gemini Integration**: Extract speaker names from transcription context
431
+
432
+ **Label Handling:**
433
+ - Without existing labels → Generic labels (SPEAKER_00, SPEAKER_01)
434
+ - With existing labels (`[Alice]`, `>> Bob:`, `SPEAKER_01:`) → Preserved during alignment
435
+ - Gemini transcription → Names extracted from context (e.g., "Hi, I'm Alice" → `Alice`)
436
+
437
+ ```python
438
+ from lattifai.client import LattifAI
439
+ from lattifai.config import DiarizationConfig
440
+
441
+ client = LattifAI(
442
+ diarization_config=DiarizationConfig(
443
+ enabled=True,
444
+ device="cuda",
445
+ min_speakers=2,
446
+ max_speakers=4,
447
+ )
448
+ )
449
+ caption = client.alignment(...)
450
+
451
+ for segment in caption.supervisions:
452
+ print(f"[{segment.speaker}] {segment.text}")
453
+ ```
454
+
455
+ **CLI:**
456
+ ```bash
457
+ lai alignment align audio.wav subtitle.srt output.srt \
458
+ diarization.enabled=true \
459
+ diarization.device=cuda
460
+ ```
461
+
462
+ ### Data Flow
463
+
464
+ ```
465
+ Input Media → AudioLoader → Aligner → (Diarizer) → Caption
466
+
467
+ Input Caption → Reader → Tokenizer
468
+ ```
469
+
470
+ ---
471
+
472
+ ## Text Processing
473
+
474
+ The tokenizer handles various text patterns for forced alignment.
475
+
476
+ ### Bracket/Caption Handling
477
+
478
+ Visual captions and annotations in brackets are treated specially - they get **two pronunciation paths** so the aligner can choose:
479
+ 1. **Silence path** - skip when content doesn't appear in audio
480
+ 2. **Inner text pronunciation** - match if someone actually says the words
481
+
482
+ | Bracket Type | Symbol | Example | Alignment Behavior |
483
+ |--------------|--------|---------|-------------------|
484
+ | Half-width square | `[]` | `[APPLAUSE]` | Skip or match "applause" |
485
+ | Half-width paren | `()` | `(music)` | Skip or match "music" |
486
+ | Full-width square | `【】` | `【笑声】` | Skip or match "笑声" |
487
+ | Full-width paren | `()` | `(音乐)` | Skip or match "音乐" |
488
+ | Angle brackets | `<>` | `<intro>` | Skip or match "intro" |
489
+ | Book title marks | `《》` | `《开场白》` | Skip or match "开场白" |
490
+
491
+ This allows proper handling of:
492
+ - **Visual descriptions**: `[Barret adjusts the camera and smiles]` → skipped if not spoken
493
+ - **Sound effects**: `[APPLAUSE]`, `(music)` → matched if audible
494
+ - **Chinese annotations**: `【笑声】`, `(鼓掌)` → flexible alignment
495
+
496
+ ### Multilingual Text
497
+
498
+ | Pattern | Handling | Example |
499
+ |---------|----------|---------|
500
+ | CJK characters | Split individually | `你好` → `["你", "好"]` |
501
+ | Latin words | Grouped with accents | `Kühlschrank` → `["Kühlschrank"]` |
502
+ | Contractions | Kept together | `I'm`, `don't`, `we'll` |
503
+ | Punctuation | Attached to words | `Hello,` `world!` |
504
+
505
+ ### Speaker Labels
506
+
507
+ Recognized speaker patterns are preserved during alignment:
508
+
509
+ | Format | Example | Output |
510
+ |--------|---------|--------|
511
+ | Arrow prefix | `>> Alice:` or `&gt;&gt; Alice:` | `[Alice]` |
512
+ | LattifAI format | `[SPEAKER_01]:` | `[SPEAKER_01]` |
513
+ | Uppercase name | `SPEAKER NAME:` | `[SPEAKER NAME]` |
514
+
515
+ ---
516
+
517
+ ## Supported Formats & Languages
518
+
519
+ ### Media Formats
520
+
521
+ | Type | Formats |
522
+ |------|---------|
523
+ | **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
524
+ | **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
525
+ | **Caption** | SRT, VTT, ASS, SSA, SRV3, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
526
+
527
+ > **Note**: Caption format handling is provided by [lattifai-captions](https://github.com/lattifai/captions), which is automatically installed as a dependency. For standalone caption processing without alignment features, install `pip install lattifai-captions`.
528
+
529
+ ### JSON Format
530
+
531
+ JSON is the most flexible format for storing caption data with full word-level timing support:
532
+
533
+ ```json
534
+ [
535
+ {
536
+ "text": "Hello beautiful world",
537
+ "start": 0.0,
538
+ "end": 2.5,
539
+ "speaker": "Speaker 1",
540
+ "words": [
541
+ {"word": "Hello", "start": 0.0, "end": 0.5},
542
+ {"word": "beautiful", "start": 0.6, "end": 1.4},
543
+ {"word": "world", "start": 1.5, "end": 2.5}
544
+ ]
545
+ }
546
+ ]
547
+ ```
548
+
549
+ **Features:**
550
+ - Word-level timestamps preserved in `words` array
551
+ - Round-trip compatible (read/write without data loss)
552
+ - Optional `speaker` field for multi-speaker content
553
+
554
+ ### Word-Level and Karaoke Output
555
+
556
+ | Format | `word_level=True` | `word_level=True` + `karaoke=True` |
557
+ |--------|-------------------|-----------------------------------|
558
+ | **JSON** | Includes `words` array | Same as word_level=True |
559
+ | **SRT** | One word per segment | One word per segment |
560
+ | **VTT** | One word per segment | YouTube VTT style: `<00:00:00.000><c> word</c>` |
561
+ | **ASS** | One word per segment | `{\kf}` karaoke tags (sweep effect) |
562
+ | **LRC** | One word per line | Enhanced `<timestamp>` tags |
563
+ | **TTML** | One word per `<p>` element | `<span>` with `itunes:timing="Word"` |
564
+
565
+ ### VTT Format (YouTube VTT Support)
566
+
567
+ The VTT format handler supports both standard WebVTT and YouTube VTT with word-level timestamps.
568
+
569
+ **Reading**: VTT automatically detects YouTube VTT format (with `<timestamp><c>` tags) and extracts word-level alignment data:
570
+
571
+ ```
572
+ WEBVTT
573
+
574
+ 00:00:00.000 --> 00:00:02.000
575
+ <00:00:00.000><c> Hello</c><00:00:00.500><c> world</c>
576
+ ```
577
+
578
+ **Writing**: Use `word_level=True` with `karaoke_config` to output YouTube VTT style:
579
+
580
+ ```python
581
+ from lattifai.caption import Caption
582
+ from lattifai.caption.config import KaraokeConfig
583
+
584
+ caption = Caption.read("input.vtt")
585
+ caption.write(
586
+ "output.vtt",
587
+ word_level=True,
588
+ karaoke_config=KaraokeConfig(enabled=True)
589
+ )
590
+ ```
591
+
592
+ ```bash
593
+ # CLI: Convert to YouTube VTT with word-level timestamps
594
+ lai caption convert input.json output.vtt \
595
+ caption.word_level=true \
596
+ caption.karaoke.enabled=true
597
+ ```
598
+
599
+ ### Transcription Language Support
600
+
601
+ #### Gemini Models (100+ Languages)
602
+
603
+ **Models**: `gemini-2.5-pro`, `gemini-3-pro-preview`, `gemini-3-flash-preview`
604
+
605
+ English, Chinese (Mandarin & Cantonese), Spanish, French, German, Italian, Portuguese, Japanese, Korean, Arabic, Russian, Hindi, Bengali, Turkish, Dutch, Polish, Swedish, Danish, Norwegian, Finnish, Greek, Hebrew, Thai, Vietnamese, Indonesian, Malay, Filipino, Ukrainian, Czech, Romanian, Hungarian, and 70+ more.
606
+
607
+ > Requires Gemini API key from [Google AI Studio](https://aistudio.google.com/apikey)
608
+
609
+ #### NVIDIA Parakeet (24 European Languages)
610
+
611
+ **Model**: `nvidia/parakeet-tdt-0.6b-v3`
612
+
613
+ | Region | Languages |
614
+ |--------|-----------|
615
+ | Western Europe | English (en), French (fr), German (de), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl) |
616
+ | Nordic | Danish (da), Swedish (sv), Norwegian (no), Finnish (fi) |
617
+ | Eastern Europe | Polish (pl), Czech (cs), Slovak (sk), Hungarian (hu), Romanian (ro), Bulgarian (bg), Ukrainian (uk), Russian (ru) |
618
+ | Others | Croatian (hr), Estonian (et), Latvian (lv), Lithuanian (lt), Slovenian (sl), Maltese (mt), Greek (el) |
619
+
620
+ #### Alibaba SenseVoice (5 Asian Languages)
621
+
622
+ **Model**: `iic/SenseVoiceSmall`
623
+
624
+ Chinese/Mandarin (zh), English (en), Japanese (ja), Korean (ko), Cantonese (yue)
625
+
626
+ ---
627
+
628
+ ## Roadmap
629
+
630
+ Visit [lattifai.com/roadmap](https://lattifai.com/roadmap) for updates.
631
+
632
+ | Date | Release | Features |
633
+ |------|---------|----------|
634
+ | **Oct 2025** | Lattice-1-Alpha | ✅ English forced alignment, multi-format support |
635
+ | **Nov 2025** | Lattice-1 | ✅ EN+ZH+DE, speaker diarization, multi-model transcription |
636
+ | **Q1 2026** | Lattice-2 | ✅ Streaming mode, 🔮 40+ languages, real-time alignment |
637
+
638
+ ---
639
+
640
+ ## Development
641
+
642
+ ```bash
643
+ git clone https://github.com/lattifai/lattifai-python.git
644
+ cd lattifai-python
645
+
646
+ # Using uv (recommended)
647
+ uv sync && source .venv/bin/activate
648
+
649
+ # Or pip
650
+ pip install -e ".[all,dev]"
651
+
652
+ # Run tests
653
+ pytest
654
+
655
+ # Install pre-commit hooks
656
+ pre-commit install
657
+ ```
658
+
659
+ ## Contributing
660
+
661
+ 1. Fork the repository
662
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
663
+ 3. Make changes and add tests
664
+ 4. Run `pytest` and `pre-commit run --all-files`
665
+ 5. Commit your changes (`git commit -m 'Add amazing feature'`)
666
+ 6. Push to branch (`git push origin feature/amazing-feature`)
667
+ 7. Open a Pull Request
668
+
669
+ ---
670
+
671
+ ## Support
672
+
673
+ - **Issues**: [GitHub Issues](https://github.com/lattifai/lattifai-python/issues)
674
+ - **Discord**: [Join our community](https://discord.gg/kvF4WsBRK8)
675
+
676
+ ## License
677
+
678
+ MIT License