lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. lattifai/alignment/__init__.py +10 -1
  2. lattifai/alignment/lattice1_aligner.py +66 -58
  3. lattifai/alignment/punctuation.py +38 -0
  4. lattifai/alignment/sentence_splitter.py +152 -21
  5. lattifai/alignment/text_align.py +440 -0
  6. lattifai/alignment/tokenizer.py +82 -40
  7. lattifai/caption/__init__.py +82 -6
  8. lattifai/caption/caption.py +335 -1141
  9. lattifai/caption/formats/__init__.py +199 -0
  10. lattifai/caption/formats/base.py +211 -0
  11. lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
  12. lattifai/caption/formats/json.py +194 -0
  13. lattifai/caption/formats/lrc.py +309 -0
  14. lattifai/caption/formats/nle/__init__.py +9 -0
  15. lattifai/caption/formats/nle/audition.py +561 -0
  16. lattifai/caption/formats/nle/avid.py +423 -0
  17. lattifai/caption/formats/nle/fcpxml.py +549 -0
  18. lattifai/caption/formats/nle/premiere.py +589 -0
  19. lattifai/caption/formats/pysubs2.py +642 -0
  20. lattifai/caption/formats/sbv.py +147 -0
  21. lattifai/caption/formats/tabular.py +338 -0
  22. lattifai/caption/formats/textgrid.py +193 -0
  23. lattifai/caption/formats/ttml.py +652 -0
  24. lattifai/caption/formats/vtt.py +469 -0
  25. lattifai/caption/parsers/__init__.py +9 -0
  26. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  27. lattifai/caption/standardize.py +636 -0
  28. lattifai/caption/utils.py +474 -0
  29. lattifai/cli/__init__.py +2 -1
  30. lattifai/cli/caption.py +108 -1
  31. lattifai/cli/transcribe.py +1 -1
  32. lattifai/cli/youtube.py +4 -1
  33. lattifai/client.py +33 -113
  34. lattifai/config/__init__.py +11 -1
  35. lattifai/config/alignment.py +7 -0
  36. lattifai/config/caption.py +267 -23
  37. lattifai/config/media.py +20 -0
  38. lattifai/diarization/__init__.py +41 -1
  39. lattifai/mixin.py +27 -15
  40. lattifai/transcription/base.py +6 -1
  41. lattifai/transcription/lattifai.py +19 -54
  42. lattifai/utils.py +7 -13
  43. lattifai/workflow/__init__.py +28 -4
  44. lattifai/workflow/file_manager.py +2 -5
  45. lattifai/youtube/__init__.py +43 -0
  46. lattifai/youtube/client.py +1170 -0
  47. lattifai/youtube/types.py +23 -0
  48. lattifai-1.2.2.dist-info/METADATA +615 -0
  49. lattifai-1.2.2.dist-info/RECORD +76 -0
  50. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  51. lattifai/caption/gemini_writer.py +0 -173
  52. lattifai/cli/app_installer.py +0 -142
  53. lattifai/cli/server.py +0 -44
  54. lattifai/server/app.py +0 -427
  55. lattifai/workflow/youtube.py +0 -577
  56. lattifai-1.2.1.dist-info/METADATA +0 -1134
  57. lattifai-1.2.1.dist-info/RECORD +0 -58
  58. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  60. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, List, Optional
3
+
4
+
5
+ @dataclass
6
+ class VideoMetadata:
7
+ video_id: str
8
+ title: str
9
+ description: str
10
+ duration: float # seconds
11
+ thumbnail_url: str
12
+ channel_name: str
13
+ view_count: int
14
+ upload_date: Optional[str] = None
15
+
16
+
17
+ @dataclass
18
+ class CaptionTrack:
19
+ language_code: str
20
+ language_name: str
21
+ kind: str # 'manual' | 'asr'
22
+ ext: str # 'vtt', 'srv3' etc
23
+ url: Optional[str] = None
@@ -0,0 +1,615 @@
1
+ Metadata-Version: 2.4
2
+ Name: lattifai
3
+ Version: 1.2.2
4
+ Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
5
+ Author-email: Lattifai Technologies <tech@lattifai.com>
6
+ Maintainer-email: Lattice <tech@lattifai.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2025 LattifAI.
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: Homepage, https://github.com/lattifai/lattifai-python
30
+ Project-URL: Documentation, https://github.com/lattifai/lattifai-python/blob/main/README.md
31
+ Project-URL: Bug Tracker, https://github.com/lattifai/lattifai-python/issues
32
+ Project-URL: Discussions, https://github.com/lattifai/lattifai-python/discussions
33
+ Project-URL: Changelog, https://github.com/lattifai/lattifai-python/blob/main/CHANGELOG.md
34
+ Keywords: lattifai,speech recognition,video analysis,ai,sdk,api client
35
+ Classifier: Development Status :: 5 - Production/Stable
36
+ Classifier: Intended Audience :: Developers
37
+ Classifier: Intended Audience :: Science/Research
38
+ Classifier: License :: OSI Approved :: MIT License
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Programming Language :: Python :: 3.13
43
+ Classifier: Programming Language :: Python :: 3.14
44
+ Classifier: Operating System :: MacOS :: MacOS X
45
+ Classifier: Operating System :: POSIX :: Linux
46
+ Classifier: Operating System :: Microsoft :: Windows
47
+ Classifier: Topic :: Multimedia :: Sound/Audio
48
+ Classifier: Topic :: Multimedia :: Video
49
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
50
+ Requires-Python: <3.15,>=3.10
51
+ Description-Content-Type: text/markdown
52
+ License-File: LICENSE
53
+ Requires-Dist: lattifai[core]
54
+ Requires-Dist: lattifai[alignment]
55
+ Requires-Dist: lattifai[transcription]
56
+ Requires-Dist: lattifai[workflow]
57
+ Requires-Dist: lattifai[server]
58
+ Provides-Extra: core
59
+ Requires-Dist: k2py>=0.2.1; extra == "core"
60
+ Requires-Dist: lattifai-core>=0.6.0; extra == "core"
61
+ Requires-Dist: lattifai-run>=1.0.1; extra == "core"
62
+ Requires-Dist: python-dotenv; extra == "core"
63
+ Requires-Dist: msgpack; extra == "core"
64
+ Requires-Dist: scipy!=1.16.3; extra == "core"
65
+ Requires-Dist: av; extra == "core"
66
+ Provides-Extra: alignment
67
+ Requires-Dist: lhotse>=1.26.0; extra == "alignment"
68
+ Requires-Dist: colorful>=0.5.6; extra == "alignment"
69
+ Requires-Dist: pysubs2; extra == "alignment"
70
+ Requires-Dist: praatio; extra == "alignment"
71
+ Requires-Dist: tgt; extra == "alignment"
72
+ Requires-Dist: onnx>=1.16.0; extra == "alignment"
73
+ Requires-Dist: onnxruntime; extra == "alignment"
74
+ Requires-Dist: g2p-phonemizer>=0.4.0; extra == "alignment"
75
+ Requires-Dist: wtpsplit>=2.1.7; extra == "alignment"
76
+ Requires-Dist: modelscope>=1.33.0; extra == "alignment"
77
+ Requires-Dist: error-align-fix>=0.1.4; extra == "alignment"
78
+ Provides-Extra: transcription
79
+ Requires-Dist: OmniSenseVoice>=0.4.2; extra == "transcription"
80
+ Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "transcription"
81
+ Requires-Dist: google-genai>=1.22.0; extra == "transcription"
82
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "transcription"
83
+ Provides-Extra: workflow
84
+ Requires-Dist: questionary>=2.0; extra == "workflow"
85
+ Requires-Dist: yt-dlp; extra == "workflow"
86
+ Requires-Dist: pycryptodome; extra == "workflow"
87
+ Provides-Extra: server
88
+ Requires-Dist: fastapi>=0.111.0; extra == "server"
89
+ Requires-Dist: uvicorn>=0.30.0; extra == "server"
90
+ Requires-Dist: python-multipart>=0.0.9; extra == "server"
91
+ Requires-Dist: jinja2>=3.1.4; extra == "server"
92
+ Provides-Extra: dev
93
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
94
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
95
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
96
+ Dynamic: license-file
97
+
98
+ <div align="center">
99
+ <img src="https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/logo.png" width=256>
100
+
101
+ [![PyPI version](https://badge.fury.io/py/lattifai.svg)](https://badge.fury.io/py/lattifai)
102
+ [![Python Versions](https://img.shields.io/pypi/pyversions/lattifai.svg)](https://pypi.org/project/lattifai)
103
+ [![PyPI Status](https://pepy.tech/badge/lattifai)](https://pepy.tech/project/lattifai)
104
+ </div>
105
+
106
+ <p align="center">
107
+ 🌐 <a href="https://lattifai.com"><b>Official Website</b></a> &nbsp;&nbsp; | &nbsp;&nbsp; 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> &nbsp;&nbsp; | &nbsp;&nbsp; 🤗 <a href="https://huggingface.co/Lattifai/Lattice-1">Model</a> &nbsp;&nbsp; | &nbsp;&nbsp; 📑 <a href="https://lattifai.com/blogs">Blog</a> &nbsp;&nbsp; | &nbsp;&nbsp; <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
108
+ </p>
109
+
110
+
111
+ # LattifAI: Precision Alignment, Infinite Possibilities
112
+
113
+ Advanced forced alignment and subtitle generation powered by the [🤗 Lattice-1](https://huggingface.co/Lattifai/Lattice-1) model.
114
+
115
+ ## Table of Contents
116
+
117
+ - [Features](#features)
118
+ - [Installation](#installation)
119
+ - [Quick Start](#quick-start)
120
+ - [CLI Reference](#cli-reference)
121
+ - [Python SDK](#python-sdk)
122
+ - [Advanced Features](#advanced-features)
123
+ - [Supported Formats & Languages](#supported-formats--languages)
124
+ - [Roadmap](#roadmap)
125
+ - [Development](#development)
126
+
127
+ ---
128
+
129
+ ## Features
130
+
131
+ | Feature | Description |
132
+ |---------|-------------|
133
+ | **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/Lattifai/Lattice-1) |
134
+ | **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) |
135
+ | **Speaker Diarization** | Multi-speaker identification with label preservation |
136
+ | **Streaming Mode** | Process audio up to 20 hours with minimal memory |
137
+ | **Universal Format Support** | 30+ caption/subtitle formats |
138
+
139
+ ### Alignment Models
140
+
141
+ | Model | Languages | Description |
142
+ |-------|-----------|-------------|
143
+ | **Lattice-1** | English, Chinese, German | Production model with mixed-language alignment support |
144
+ | **Lattice-1-Alpha** | English | Initial release with English forced alignment |
145
+
146
+ **Model Hub**: Models can be downloaded from `huggingface` (default) or `modelscope` (recommended for users in China):
147
+
148
+ ```bash
149
+ # Use ModelScope (faster in China)
150
+ lai alignment align audio.wav caption.srt output.srt alignment.model_hub=modelscope
151
+ ```
152
+
153
+ ```python
154
+ from lattifai import LattifAI, AlignmentConfig
155
+
156
+ client = LattifAI(alignment_config=AlignmentConfig(model_hub="modelscope"))
157
+ ```
158
+
159
+ ---
160
+
161
+ ## Installation
162
+
163
+ ### Using uv (Recommended)
164
+
165
+ [uv](https://github.com/astral-sh/uv) is a fast Python package manager (10-100x faster than pip).
166
+
167
+ ```bash
168
+ # Install uv
169
+ curl -LsSf https://astral.sh/uv/install.sh | sh
170
+
171
+ # Quick start (run without installing)
172
+ uvx --from lattifai lai --help
173
+
174
+ # Or create a project
175
+ mkdir my-project && cd my-project
176
+ uv init --bare && uv add lattifai
177
+ uv run lai alignment align audio.wav caption.srt output.srt
178
+ ```
179
+
180
+ ### Using pip
181
+
182
+ ```bash
183
+ pip install lattifai
184
+ ```
185
+
186
+ ### API Keys
187
+
188
+ **LattifAI API Key (Required)** - Get your free key at [lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
189
+
190
+ ```bash
191
+ export LATTIFAI_API_KEY="lf_your_api_key_here"
192
+ ```
193
+
194
+ **Gemini API Key (Optional)** - For transcription with Gemini models, get key at [aistudio.google.com/apikey](https://aistudio.google.com/apikey)
195
+
196
+ ```bash
197
+ export GEMINI_API_KEY="your_gemini_api_key_here"
198
+ ```
199
+
200
+ Or use a `.env` file:
201
+ ```bash
202
+ LATTIFAI_API_KEY=lf_your_api_key_here
203
+ GEMINI_API_KEY=your_gemini_api_key_here
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Quick Start
209
+
210
+ ### Command Line
211
+
212
+ ```bash
213
+ # Align audio with subtitle
214
+ lai alignment align audio.wav subtitle.srt output.srt
215
+
216
+ # YouTube video
217
+ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
218
+ ```
219
+
220
+ ### Python SDK
221
+
222
+ ```python
223
+ from lattifai import LattifAI
224
+
225
+ client = LattifAI()
226
+ caption = client.alignment(
227
+ input_media="audio.wav",
228
+ input_caption="subtitle.srt",
229
+ output_caption_path="aligned.srt",
230
+ )
231
+ ```
232
+
233
+ ---
234
+
235
+ ## CLI Reference
236
+
237
+ | Command | Description | Example |
238
+ |---------|-------------|---------|
239
+ | `lai alignment align` | Align audio/video with caption | `lai alignment align audio.wav caption.srt output.srt` |
240
+ | `lai alignment youtube` | Download & align YouTube | `lai alignment youtube "https://youtube.com/watch?v=ID"` |
241
+ | `lai transcribe run` | Transcribe audio/video | `lai transcribe run audio.wav output.srt` |
242
+ | `lai transcribe align` | Transcribe and align | `lai transcribe align audio.wav output.srt` |
243
+ | `lai caption convert` | Convert caption formats | `lai caption convert input.srt output.vtt` |
244
+ | `lai caption shift` | Shift timestamps | `lai caption shift input.srt output.srt 2.0` |
245
+
246
+ ### Common Options
247
+
248
+ ```bash
249
+ # Device selection
250
+ alignment.device=cuda # cuda, mps, cpu
251
+
252
+ # Caption options
253
+ caption.split_sentence=true # Smart sentence splitting
254
+ caption.word_level=true # Word-level timestamps
255
+
256
+ # Streaming for long audio
257
+ media.streaming_chunk_secs=600
258
+
259
+ # Channel selection
260
+ media.channel_selector=left # left, right, average, or index
261
+ ```
262
+
263
+ ### Transcription Models
264
+
265
+ ```bash
266
+ # Gemini (100+ languages, requires GEMINI_API_KEY)
267
+ transcription.model_name=gemini-2.5-pro
268
+
269
+ # Parakeet (24 European languages)
270
+ transcription.model_name=nvidia/parakeet-tdt-0.6b-v3
271
+
272
+ # SenseVoice (zh, en, ja, ko, yue)
273
+ transcription.model_name=iic/SenseVoiceSmall
274
+ ```
275
+
276
+ ### lai transcribe run
277
+
278
+ Transcribe audio/video files or YouTube URLs to generate timestamped captions.
279
+
280
+ ```bash
281
+ # Local file
282
+ lai transcribe run audio.wav output.srt
283
+
284
+ # YouTube URL
285
+ lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output_dir=./output
286
+
287
+ # With model selection
288
+ lai transcribe run audio.wav output.srt \
289
+ transcription.model_name=gemini-2.5-pro \
290
+ transcription.device=cuda
291
+ ```
292
+
293
+ **Parameters:**
294
+ - `input`: Path to audio/video file or YouTube URL
295
+ - `output_caption`: Output caption file path (for local files)
296
+ - `output_dir`: Output directory (for YouTube URLs, defaults to current directory)
297
+ - `channel_selector`: Audio channel - `average` (default), `left`, `right`, or channel index
298
+
299
+ ### lai transcribe align
300
+
301
+ Transcribe and align in a single step - produces precisely aligned captions.
302
+
303
+ ```bash
304
+ # Basic usage
305
+ lai transcribe align audio.wav output.srt
306
+
307
+ # With options
308
+ lai transcribe align audio.wav output.srt \
309
+ transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
310
+ alignment.device=cuda \
311
+ caption.split_sentence=true \
312
+ caption.word_level=true
313
+ ```
314
+
315
+ ---
316
+
317
+ ## Python SDK
318
+
319
+ ### Configuration Objects
320
+
321
+ ```python
322
+ from lattifai import (
323
+ LattifAI,
324
+ ClientConfig,
325
+ AlignmentConfig,
326
+ CaptionConfig,
327
+ DiarizationConfig,
328
+ MediaConfig,
329
+ )
330
+
331
+ client = LattifAI(
332
+ client_config=ClientConfig(api_key="lf_xxx", timeout=60.0),
333
+ alignment_config=AlignmentConfig(device="cuda"),
334
+ caption_config=CaptionConfig(split_sentence=True, word_level=True),
335
+ )
336
+
337
+ caption = client.alignment(
338
+ input_media="audio.wav",
339
+ input_caption="subtitle.srt",
340
+ output_caption_path="output.json",
341
+ )
342
+
343
+ # Access results
344
+ for segment in caption.supervisions:
345
+ print(f"{segment.start:.2f}s - {segment.end:.2f}s: {segment.text}")
346
+ ```
347
+
348
+ ### YouTube Processing
349
+
350
+ ```python
351
+ caption = client.youtube(
352
+ url="https://youtube.com/watch?v=VIDEO_ID",
353
+ output_dir="./downloads",
354
+ output_caption_path="aligned.srt",
355
+ )
356
+ ```
357
+
358
+ ### CaptionConfig Options
359
+
360
+ | Option | Default | Description |
361
+ |--------|---------|-------------|
362
+ | `split_sentence` | `False` | Smart sentence splitting, separates non-speech elements |
363
+ | `word_level` | `False` | Include word-level timestamps in output |
364
+ | `normalize_text` | `True` | Clean HTML entities and special characters |
365
+ | `include_speaker_in_text` | `True` | Include speaker labels in text output |
366
+
367
+ ```python
368
+ from lattifai import LattifAI, CaptionConfig
369
+
370
+ client = LattifAI(
371
+ caption_config=CaptionConfig(
372
+ split_sentence=True,
373
+ word_level=True,
374
+ normalize_text=True,
375
+ include_speaker_in_text=False,
376
+ )
377
+ )
378
+ ```
379
+
380
+ ---
381
+
382
+ ## Advanced Features
383
+
384
+ ### Streaming Mode (Long Audio)
385
+
386
+ Process audio up to 20 hours with minimal memory:
387
+
388
+ ```python
389
+ caption = client.alignment(
390
+ input_media="long_audio.wav",
391
+ input_caption="subtitle.srt",
392
+ streaming_chunk_secs=600.0, # 10-minute chunks
393
+ )
394
+ ```
395
+
396
+ ### Word-Level Alignment
397
+
398
+ ```python
399
+ client = LattifAI(caption_config=CaptionConfig(word_level=True))
400
+ caption = client.alignment(
401
+ input_media="audio.wav",
402
+ input_caption="subtitle.srt",
403
+ output_caption_path="output.json", # JSON preserves word-level data
404
+ )
405
+ ```
406
+
407
+ ### Speaker Diarization
408
+
409
+ Automatically identify and label different speakers in audio.
410
+
411
+ **Capabilities:**
412
+ - **Multi-Speaker Detection**: Automatically detect speaker changes
413
+ - **Smart Labeling**: Assign labels (SPEAKER_00, SPEAKER_01, etc.)
414
+ - **Label Preservation**: Maintain existing speaker names from input captions
415
+ - **Gemini Integration**: Extract speaker names from transcription context
416
+
417
+ **Label Handling:**
418
+ - Without existing labels → Generic labels (SPEAKER_00, SPEAKER_01)
419
+ - With existing labels (`[Alice]`, `>> Bob:`, `SPEAKER_01:`) → Preserved during alignment
420
+ - Gemini transcription → Names extracted from context (e.g., "Hi, I'm Alice" → `Alice`)
421
+
422
+ ```python
423
+ from lattifai import LattifAI, DiarizationConfig
424
+
425
+ client = LattifAI(
426
+ diarization_config=DiarizationConfig(
427
+ enabled=True,
428
+ device="cuda",
429
+ min_speakers=2,
430
+ max_speakers=4,
431
+ )
432
+ )
433
+ caption = client.alignment(...)
434
+
435
+ for segment in caption.supervisions:
436
+ print(f"[{segment.speaker}] {segment.text}")
437
+ ```
438
+
439
+ **CLI:**
440
+ ```bash
441
+ lai alignment align audio.wav subtitle.srt output.srt \
442
+ diarization.enabled=true \
443
+ diarization.device=cuda
444
+ ```
445
+
446
+ ### Data Flow
447
+
448
+ ```
449
+ Input Media → AudioLoader → Aligner → (Diarizer) → Caption
450
+
451
+ Input Caption → Reader → Tokenizer
452
+ ```
453
+
454
+ ---
455
+
456
+ ## Supported Formats & Languages
457
+
458
+ ### Media Formats
459
+
460
+ | Type | Formats |
461
+ |------|---------|
462
+ | **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
463
+ | **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
464
+ | **Caption** | SRT, VTT, ASS, SSA, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
465
+
466
+ ### JSON Format
467
+
468
+ JSON is the most flexible format for storing caption data with full word-level timing support:
469
+
470
+ ```json
471
+ [
472
+ {
473
+ "text": "Hello beautiful world",
474
+ "start": 0.0,
475
+ "end": 2.5,
476
+ "speaker": "Speaker 1",
477
+ "words": [
478
+ {"word": "Hello", "start": 0.0, "end": 0.5},
479
+ {"word": "beautiful", "start": 0.6, "end": 1.4},
480
+ {"word": "world", "start": 1.5, "end": 2.5}
481
+ ]
482
+ }
483
+ ]
484
+ ```
485
+
486
+ **Features:**
487
+ - Word-level timestamps preserved in `words` array
488
+ - Round-trip compatible (read/write without data loss)
489
+ - Optional `speaker` field for multi-speaker content
490
+
491
+ ### Word-Level and Karaoke Output
492
+
493
+ | Format | `word_level=True` | `word_level=True` + `karaoke=True` |
494
+ |--------|-------------------|-----------------------------------|
495
+ | **JSON** | Includes `words` array | Same as word_level=True |
496
+ | **SRT** | One word per segment | One word per segment |
497
+ | **VTT** | One word per segment | YouTube VTT style: `<00:00:00.000><c> word</c>` |
498
+ | **ASS** | One word per segment | `{\kf}` karaoke tags (sweep effect) |
499
+ | **LRC** | One word per line | Enhanced `<timestamp>` tags |
500
+ | **TTML** | One word per `<p>` element | `<span>` with `itunes:timing="Word"` |
501
+
502
+ ### VTT Format (YouTube VTT Support)
503
+
504
+ The VTT format handler supports both standard WebVTT and YouTube VTT with word-level timestamps.
505
+
506
+ **Reading**: VTT automatically detects YouTube VTT format (with `<timestamp><c>` tags) and extracts word-level alignment data:
507
+
508
+ ```
509
+ WEBVTT
510
+
511
+ 00:00:00.000 --> 00:00:02.000
512
+ <00:00:00.000><c> Hello</c><00:00:00.500><c> world</c>
513
+ ```
514
+
515
+ **Writing**: Use `word_level=True` with `karaoke_config` to output YouTube VTT style:
516
+
517
+ ```python
518
+ from lattifai import Caption
519
+ from lattifai.config.caption import KaraokeConfig
520
+
521
+ caption = Caption.read("input.vtt")
522
+ caption.write(
523
+ "output.vtt",
524
+ word_level=True,
525
+ karaoke_config=KaraokeConfig(enabled=True)
526
+ )
527
+ ```
528
+
529
+ ```bash
530
+ # CLI: Convert to YouTube VTT with word-level timestamps
531
+ lai caption convert input.json output.vtt \
532
+ caption.word_level=true \
533
+ caption.karaoke.enabled=true
534
+ ```
535
+
536
+ ### Transcription Language Support
537
+
538
+ #### Gemini Models (100+ Languages)
539
+
540
+ **Models**: `gemini-2.5-pro`, `gemini-3-pro-preview`, `gemini-3-flash-preview`
541
+
542
+ English, Chinese (Mandarin & Cantonese), Spanish, French, German, Italian, Portuguese, Japanese, Korean, Arabic, Russian, Hindi, Bengali, Turkish, Dutch, Polish, Swedish, Danish, Norwegian, Finnish, Greek, Hebrew, Thai, Vietnamese, Indonesian, Malay, Filipino, Ukrainian, Czech, Romanian, Hungarian, and 70+ more.
543
+
544
+ > Requires Gemini API key from [Google AI Studio](https://aistudio.google.com/apikey)
545
+
546
+ #### NVIDIA Parakeet (24 European Languages)
547
+
548
+ **Model**: `nvidia/parakeet-tdt-0.6b-v3`
549
+
550
+ | Region | Languages |
551
+ |--------|-----------|
552
+ | Western Europe | English (en), French (fr), German (de), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl) |
553
+ | Nordic | Danish (da), Swedish (sv), Norwegian (no), Finnish (fi) |
554
+ | Eastern Europe | Polish (pl), Czech (cs), Slovak (sk), Hungarian (hu), Romanian (ro), Bulgarian (bg), Ukrainian (uk), Russian (ru) |
555
+ | Others | Croatian (hr), Estonian (et), Latvian (lv), Lithuanian (lt), Slovenian (sl), Maltese (mt), Greek (el) |
556
+
557
+ #### Alibaba SenseVoice (5 Asian Languages)
558
+
559
+ **Model**: `iic/SenseVoiceSmall`
560
+
561
+ Chinese/Mandarin (zh), English (en), Japanese (ja), Korean (ko), Cantonese (yue)
562
+
563
+ ---
564
+
565
+ ## Roadmap
566
+
567
+ Visit [lattifai.com/roadmap](https://lattifai.com/roadmap) for updates.
568
+
569
+ | Date | Release | Features |
570
+ |------|---------|----------|
571
+ | **Oct 2025** | Lattice-1-Alpha | ✅ English forced alignment, multi-format support |
572
+ | **Nov 2025** | Lattice-1 | ✅ EN+ZH+DE, speaker diarization, multi-model transcription |
573
+ | **Q1 2026** | Lattice-2 | ✅ Streaming mode, 🔮 40+ languages, real-time alignment |
574
+
575
+ ---
576
+
577
+ ## Development
578
+
579
+ ```bash
580
+ git clone https://github.com/lattifai/lattifai-python.git
581
+ cd lattifai-python
582
+
583
+ # Using uv (recommended)
584
+ uv sync && source .venv/bin/activate
585
+
586
+ # Or pip
587
+ pip install -e ".[test]"
588
+
589
+ # Run tests
590
+ pytest
591
+
592
+ # Install pre-commit hooks
593
+ pre-commit install
594
+ ```
595
+
596
+ ## Contributing
597
+
598
+ 1. Fork the repository
599
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
600
+ 3. Make changes and add tests
601
+ 4. Run `pytest` and `pre-commit run --all-files`
602
+ 5. Commit your changes (`git commit -m 'Add amazing feature'`)
603
+ 6. Push to branch (`git push origin feature/amazing-feature`)
604
+ 7. Open a Pull Request
605
+
606
+ ---
607
+
608
+ ## Support
609
+
610
+ - **Issues**: [GitHub Issues](https://github.com/lattifai/lattifai-python/issues)
611
+ - **Discord**: [Join our community](https://discord.gg/kvF4WsBRK8)
612
+
613
+ ## License
614
+
615
+ MIT License