lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. lattifai/__init__.py +61 -47
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/alignment/lattice1_worker.py +185 -0
  5. lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/gemini_reader.py +30 -30
  12. lattifai/{io → caption}/gemini_writer.py +17 -17
  13. lattifai/{io → caption}/supervision.py +4 -3
  14. lattifai/caption/text_parser.py +145 -0
  15. lattifai/cli/__init__.py +17 -0
  16. lattifai/cli/alignment.py +153 -0
  17. lattifai/cli/caption.py +204 -0
  18. lattifai/cli/server.py +19 -0
  19. lattifai/cli/transcribe.py +197 -0
  20. lattifai/cli/youtube.py +128 -0
  21. lattifai/client.py +460 -251
  22. lattifai/config/__init__.py +20 -0
  23. lattifai/config/alignment.py +73 -0
  24. lattifai/config/caption.py +178 -0
  25. lattifai/config/client.py +46 -0
  26. lattifai/config/diarization.py +67 -0
  27. lattifai/config/media.py +335 -0
  28. lattifai/config/transcription.py +84 -0
  29. lattifai/diarization/__init__.py +5 -0
  30. lattifai/diarization/lattifai.py +89 -0
  31. lattifai/errors.py +98 -91
  32. lattifai/logging.py +116 -0
  33. lattifai/mixin.py +552 -0
  34. lattifai/server/app.py +420 -0
  35. lattifai/transcription/__init__.py +76 -0
  36. lattifai/transcription/base.py +108 -0
  37. lattifai/transcription/gemini.py +219 -0
  38. lattifai/transcription/lattifai.py +103 -0
  39. lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
  40. lattifai/types.py +30 -0
  41. lattifai/utils.py +16 -44
  42. lattifai/workflow/__init__.py +22 -0
  43. lattifai/workflow/agents.py +6 -0
  44. lattifai/{workflows → workflow}/base.py +22 -22
  45. lattifai/{workflows → workflow}/file_manager.py +239 -215
  46. lattifai/workflow/youtube.py +564 -0
  47. lattifai-1.0.0.dist-info/METADATA +736 -0
  48. lattifai-1.0.0.dist-info/RECORD +52 -0
  49. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  50. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  51. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
  52. lattifai/base_client.py +0 -126
  53. lattifai/bin/__init__.py +0 -3
  54. lattifai/bin/agent.py +0 -325
  55. lattifai/bin/align.py +0 -296
  56. lattifai/bin/cli_base.py +0 -25
  57. lattifai/bin/subtitle.py +0 -210
  58. lattifai/io/__init__.py +0 -42
  59. lattifai/io/reader.py +0 -85
  60. lattifai/io/text_parser.py +0 -75
  61. lattifai/io/utils.py +0 -15
  62. lattifai/io/writer.py +0 -90
  63. lattifai/tokenizer/__init__.py +0 -3
  64. lattifai/workers/__init__.py +0 -3
  65. lattifai/workers/lattice1_alpha.py +0 -284
  66. lattifai/workflows/__init__.py +0 -34
  67. lattifai/workflows/agents.py +0 -10
  68. lattifai/workflows/gemini.py +0 -167
  69. lattifai/workflows/prompts/README.md +0 -22
  70. lattifai/workflows/prompts/gemini/README.md +0 -24
  71. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  72. lattifai/workflows/youtube.py +0 -931
  73. lattifai-0.4.5.dist-info/METADATA +0 -808
  74. lattifai-0.4.5.dist-info/RECORD +0 -39
  75. lattifai-0.4.5.dist-info/entry_points.txt +0 -3
  76. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,736 @@
1
+ Metadata-Version: 2.4
2
+ Name: lattifai
3
+ Version: 1.0.0
4
+ Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
5
+ Author-email: Lattifai Technologies <tech@lattifai.com>
6
+ Maintainer-email: Lattice <tech@lattifai.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2025 Lattifai.
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: Homepage, https://github.com/lattifai/lattifai-python
30
+ Project-URL: Documentation, https://github.com/lattifai/lattifai-python/blob/main/README.md
31
+ Project-URL: Bug Tracker, https://github.com/lattifai/lattifai-python/issues
32
+ Project-URL: Discussions, https://github.com/lattifai/lattifai-python/discussions
33
+ Project-URL: Changelog, https://github.com/lattifai/lattifai-python/CHANGELOG.md
34
+ Keywords: lattifai,speech recognition,video analysis,ai,sdk,api client
35
+ Classifier: Development Status :: 5 - Production/Stable
36
+ Classifier: Intended Audience :: Developers
37
+ Classifier: Intended Audience :: Science/Research
38
+ Classifier: License :: OSI Approved :: MIT License
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Programming Language :: Python :: 3.13
43
+ Classifier: Programming Language :: Python :: 3.14
44
+ Classifier: Operating System :: MacOS :: MacOS X
45
+ Classifier: Operating System :: POSIX :: Linux
46
+ Classifier: Operating System :: Microsoft :: Windows
47
+ Classifier: Topic :: Multimedia :: Sound/Audio
48
+ Classifier: Topic :: Multimedia :: Video
49
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
50
+ Requires-Python: <3.15,>=3.10
51
+ Description-Content-Type: text/markdown
52
+ License-File: LICENSE
53
+ Requires-Dist: lattifai-core>=0.4.5
54
+ Requires-Dist: lattifai-run>=1.0.1
55
+ Requires-Dist: python-dotenv
56
+ Requires-Dist: lhotse>=1.26.0
57
+ Requires-Dist: colorful>=0.5.6
58
+ Requires-Dist: pysubs2
59
+ Requires-Dist: praatio
60
+ Requires-Dist: tgt
61
+ Requires-Dist: onnx>=1.16.0
62
+ Requires-Dist: onnxruntime
63
+ Requires-Dist: msgpack
64
+ Requires-Dist: g2p-phonemizer>=0.4.0
65
+ Requires-Dist: av
66
+ Requires-Dist: wtpsplit>=2.1.6
67
+ Requires-Dist: OmniSenseVoice>=0.4.0
68
+ Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc1
69
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2
70
+ Requires-Dist: questionary>=2.0
71
+ Requires-Dist: yt-dlp
72
+ Requires-Dist: pycryptodome
73
+ Requires-Dist: google-genai>=1.22.0
74
+ Requires-Dist: fastapi>=0.111.0
75
+ Requires-Dist: uvicorn>=0.30.0
76
+ Requires-Dist: python-multipart>=0.0.9
77
+ Requires-Dist: jinja2>=3.1.4
78
+ Provides-Extra: numpy
79
+ Requires-Dist: numpy; extra == "numpy"
80
+ Provides-Extra: diarization
81
+ Requires-Dist: torch-audiomentations==0.12.0; extra == "diarization"
82
+ Requires-Dist: pyannote.audio>=4.0.2; extra == "diarization"
83
+ Provides-Extra: transcription
84
+ Requires-Dist: OmniSenseVoice>=0.4.0; extra == "transcription"
85
+ Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc1; extra == "transcription"
86
+ Provides-Extra: test
87
+ Requires-Dist: pytest; extra == "test"
88
+ Requires-Dist: pytest-cov; extra == "test"
89
+ Requires-Dist: pytest-asyncio; extra == "test"
90
+ Requires-Dist: numpy; extra == "test"
91
+ Provides-Extra: all
92
+ Requires-Dist: numpy; extra == "all"
93
+ Requires-Dist: pytest; extra == "all"
94
+ Requires-Dist: pytest-cov; extra == "all"
95
+ Requires-Dist: pytest-asyncio; extra == "all"
96
+ Requires-Dist: pyannote.audio>=4.0.2; extra == "all"
97
+ Dynamic: license-file
98
+
99
+ <div align="center">
100
+ <img src="https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/logo.png" width=256>
101
+
102
+ [![PyPI version](https://badge.fury.io/py/lattifai.svg)](https://badge.fury.io/py/lattifai)
103
+ [![Python Versions](https://img.shields.io/pypi/pyversions/lattifai.svg)](https://pypi.org/project/lattifai)
104
+ [![PyPI Status](https://pepy.tech/badge/lattifai)](https://pepy.tech/project/lattifai)
105
+ </div>
106
+
107
+ <p align="center">
108
+ 🌐 <a href="https://lattifai.com"><b>Official Website</b></a> &nbsp;&nbsp; | &nbsp;&nbsp; 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> &nbsp;&nbsp; | &nbsp;&nbsp; 🤗 <a href="https://huggingface.co/Lattifai/Lattice-1">Model</a> &nbsp;&nbsp; | &nbsp;&nbsp; 📑 <a href="https://lattifai.com/blogs">Blog</a> &nbsp;&nbsp; | &nbsp;&nbsp; <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
109
+ </p>
110
+
111
+
112
+ # LattifAI: Precision Alignment, Infinite Possibilities
113
+
114
+ Advanced forced alignment and subtitle generation powered by the [🤗 Lattice-1](https://huggingface.co/Lattifai/Lattice-1) model.
115
+
116
+ > **⚠️ Note on Current Limitations**:
117
+ > 1. **Memory Usage**: We are aware of high memory consumption and are actively working on further optimizations.
118
+ > 2. **Long Audio**: Currently, long audio files might face issues. Support for **long-form audio (up to 20 hours)** will be available shortly.
119
+
120
+ ## Table of Contents
121
+
122
+ - [Installation](#installation)
123
+ - [Quick Start](#quick-start)
124
+ - [Command Line Interface](#command-line-interface)
125
+ - [Python SDK (5 Lines of Code)](#python-sdk-5-lines-of-code)
126
+ - [Web Interface](#web-interface)
127
+ - [CLI Reference](#cli-reference)
128
+ - [lai alignment align](#lai-alignment-align)
129
+ - [lai alignment youtube](#lai-alignment-youtube)
130
+ - [lai transcribe run](#lai-transcribe-run)
131
+ - [lai caption convert](#lai-caption-convert)
132
+ - [lai caption shift](#lai-caption-shift)
133
+ - [Python SDK Reference](#python-sdk-reference)
134
+ - [Basic Alignment](#basic-alignment)
135
+ - [YouTube Processing](#youtube-processing)
136
+ - [Configuration Objects](#configuration-objects)
137
+ - [Advanced Features](#advanced-features)
138
+ - [Word-Level Alignment](#word-level-alignment)
139
+ - [Smart Sentence Splitting](#smart-sentence-splitting)
140
+ - [Speaker Diarization](#speaker-diarization-wip)
141
+ - [YAML Configuration Files](#yaml-configuration-files)
142
+ - [Supported Formats](#supported-formats)
143
+ - [Roadmap](#roadmap)
144
+ - [Development](#development)
145
+
146
+ ---
147
+
148
+ ## Installation
149
+
150
+ ### Step 1: Install SDK
151
+
152
+ **Using pip:**
153
+ ```bash
154
+
155
+ pip install install-k2
156
+ install-k2 --torch-version 2.9.1 # if not set will auto-detect PyTorch version and install compatible k2
157
+
158
+ pip install lattifai==1.0.0
159
+ ```
160
+
161
+ **Using uv (Recommended - 10-100x faster):**
162
+ ```bash
163
+ # Install uv if you haven't already
164
+ curl -LsSf https://astral.sh/uv/install.sh | sh
165
+
166
+ # Create a new project with uv
167
+ uv init my-project
168
+ cd my-project
169
+ source .venv/bin/activate
170
+
171
+ # Install k2 (required dependency)
172
+ uv pip install install-k2
173
+ uv pip install pip
174
+ uv run install-k2 --torch-version 2.9.1
175
+
176
+ # Install LattifAI (v1.0.0)
177
+ uv pip install lattifai==1.0.0
178
+ ```
179
+
180
+ > **Note**: `install-k2` automatically detects your PyTorch version (up to 2.9) and installs the compatible k2 wheel.
181
+
182
+ <details>
183
+ <summary><b>install-k2 options</b></summary>
184
+
185
+ ```
186
+ usage: install-k2 [-h] [--system {linux,darwin,windows}] [--dry-run] [--torch-version TORCH_VERSION]
187
+
188
+ optional arguments:
189
+ -h, --help Show this help message and exit
190
+ --system {linux,darwin,windows} Override OS detection
191
+ --dry-run Show what would be installed without making changes
192
+ --torch-version TORCH_VERSION Specify torch version (e.g., 2.8.0)
193
+ ```
194
+ </details>
195
+
196
+ ### Step 2: Get Your API Key
197
+
198
+ **LattifAI API Key (Required)**
199
+
200
+ Get your **free API key** at [https://lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
201
+
202
+ **Option A: Environment variable (recommended)**
203
+ ```bash
204
+ export LATTIFAI_API_KEY="lf_your_api_key_here"
205
+ ```
206
+
207
+ **Option B: `.env` file**
208
+ ```bash
209
+ # .env
210
+ LATTIFAI_API_KEY=lf_your_api_key_here
211
+ ```
212
+
213
+ **Gemini API Key (Optional - for transcription)**
214
+
215
+ If you want to use Gemini models for transcription (e.g., `gemini-2.5-pro`), get your **free Gemini API key** at [https://aistudio.google.com/apikey](https://aistudio.google.com/apikey)
216
+
217
+ ```bash
218
+ # Add to environment variable
219
+ export GEMINI_API_KEY="your_gemini_api_key_here"
220
+
221
+ # Or add to .env file
222
+ GEMINI_API_KEY=your_gemini_api_key_here # AIzaSyxxxx
223
+ ```
224
+
225
+ > **Note**: Gemini API key is only required if you use Gemini models for transcription. It's not needed for alignment or when using other transcription models.
226
+
227
+ ---
228
+
229
+ ## Quick Start
230
+
231
+ ### Command Line Interface
232
+
233
+ ![CLI Demo](https://raw.githubusercontent.com/lattifai/lattifai-python/main/assets/cli.png)
234
+
235
+ ```bash
236
+ # Align local audio with subtitle
237
+ lai alignment align audio.wav subtitle.srt output.srt
238
+
239
+ # Download and align YouTube video
240
+ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
241
+ ```
242
+
243
+ ### Python SDK (5 Lines of Code)
244
+
245
+ ```python
246
+ from lattifai import LattifAI
247
+
248
+ client = LattifAI()
249
+ caption = client.alignment(
250
+ input_media="audio.wav",
251
+ input_caption="subtitle.srt",
252
+ output_caption_path="aligned.srt",
253
+ )
254
+ ```
255
+
256
+ That's it! Your aligned subtitles are saved to `aligned.srt`.
257
+
258
+ ### Web Interface
259
+
260
+ 1. **Start the backend server:**
261
+ ```bash
262
+ lai-server
263
+ ```
264
+
265
+ 2. **Start the frontend (in a new terminal):**
266
+ ```bash
267
+ cd app
268
+ npm install
269
+ npm run dev
270
+ ```
271
+
272
+ Visit `http://localhost:5173` to open the web interface.
273
+
274
+ ---
275
+
276
+ ## CLI Reference
277
+
278
+ ### Command Overview
279
+
280
+ | Command | Description |
281
+ |---------|-------------|
282
+ | `lai alignment align` | Align local audio/video with caption |
283
+ | `lai alignment youtube` | Download & align YouTube content |
284
+ | `lai transcribe run` | Transcribe audio/video or YouTube URL to caption |
285
+ | `lai transcribe align` | Transcribe audio/video and align with generated transcript |
286
+ | `lai caption convert` | Convert between caption formats |
287
+ | `lai caption normalize` | Clean and normalize caption text |
288
+ | `lai caption shift` | Shift caption timestamps |
289
+
290
+
291
+ ### lai alignment align
292
+
293
+ ```bash
294
+ # Basic usage
295
+ lai alignment align <audio> <caption> <output>
296
+
297
+ # Examples
298
+ lai alignment align audio.wav caption.srt output.srt
299
+ lai alignment align video.mp4 caption.vtt output.srt alignment.device=cuda
300
+ lai alignment align audio.wav caption.srt output.json \
301
+ caption.split_sentence=true \
302
+ caption.word_level=true
303
+ ```
304
+
305
+ ### lai alignment youtube
306
+
307
+ ```bash
308
+ # Basic usage
309
+ lai alignment youtube <url>
310
+
311
+ # Examples
312
+ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
313
+ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID" \
314
+ media.output_dir=~/Downloads \
315
+ caption.output_path=aligned.srt \
316
+ caption.split_sentence=true
317
+ ```
318
+
319
+ ### lai transcribe run
320
+
321
+ Perform automatic speech recognition (ASR) on audio/video files or YouTube URLs to generate timestamped transcriptions.
322
+
323
+ ```bash
324
+ # Basic usage - local file
325
+ lai transcribe run <input> <output>
326
+
327
+ # Basic usage - YouTube URL
328
+ lai transcribe run <url> <output_dir>
329
+
330
+ # Examples - Local files
331
+ lai transcribe run audio.wav output.srt
332
+ lai transcribe run audio.mp4 output.ass \
333
+ transcription.model_name=nvidia/parakeet-tdt-0.6b-v3
334
+
335
+ # Examples - YouTube URLs
336
+ lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output_dir=./output
337
+ lai transcribe run "https://youtube.com/watch?v=VIDEO_ID" output.ass output_dir=./output \
338
+ transcription.model_name=gemini-2.5-pro \
339
+ transcription.gemini_api_key=YOUR_GEMINI_API_KEY
340
+
341
+ # Full configuration with keyword arguments
342
+ lai transcribe run \
343
+ input=audio.wav \
344
+ output_caption=output.srt \
345
+ channel_selector=average \
346
+ transcription.device=cuda \
347
+ transcription.model_name=iic/SenseVoiceSmall
348
+ ```
349
+
350
+ **Parameters:**
351
+ - `input`: Path to audio/video file or YouTube URL (required)
352
+ - `output_caption`: Path for output caption file (for local files)
353
+ - `output_dir`: Directory for output files (for YouTube URLs, defaults to current directory)
354
+ - `media_format`: Media format for YouTube downloads (default: mp3)
355
+ - `channel_selector`: Audio channel selection - "average", "left", "right", or channel index (default: "average")
356
+ - Note: Ignored when transcribing YouTube URLs with Gemini models
357
+ - `transcription`: Transcription configuration (model_name, device, language, gemini_api_key)
358
+
359
+ **Supported Transcription Models (More Coming Soon):**
360
+ - `gemini-2.5-pro` - Google Gemini API (requires API key)
361
+ - Languages: 100+ languages including English, Chinese, Spanish, French, German, Japanese, Korean, Arabic, and more
362
+ - `gemini-3-pro-preview` - Google Gemini API (requires API key)
363
+ - Languages: 100+ languages (same as gemini-2.5-pro)
364
+ - `nvidia/parakeet-tdt-0.6b-v3` - NVIDIA Parakeet model
365
+ - Languages: Bulgarian (bg), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hungarian (hu), Italian (it), Latvian (lv), Lithuanian (lt), Maltese (mt), Polish (pl), Portuguese (pt), Romanian (ro), Slovak (sk), Slovenian (sl), Spanish (es), Swedish (sv), Russian (ru), Ukrainian (uk)
366
+ - `iic/SenseVoiceSmall` - Alibaba SenseVoice model
367
+ - Languages: Chinese/Mandarin (zh), English (en), Japanese (ja), Korean (ko), Cantonese (yue)
368
+ - More models will be integrated in future releases
369
+
370
+ **Note:** For transcription with alignment on local files, use `lai transcribe align` instead.
371
+
372
+ ### lai transcribe align
373
+
374
+ Transcribe audio/video file and automatically align the generated transcript with the audio.
375
+
376
+ This command combines transcription and alignment in a single step, producing precisely aligned captions.
377
+
378
+ ```bash
379
+ # Basic usage
380
+ lai transcribe align <input_media> <output_caption>
381
+
382
+ # Examples
383
+ lai transcribe align audio.wav output.srt
384
+ lai transcribe align audio.mp4 output.ass \
385
+ transcription.model_name=nvidia/parakeet-tdt-0.6b-v3 \
386
+ alignment.device=cuda
387
+
388
+ # Using Gemini transcription with alignment
389
+ lai transcribe align audio.wav output.srt \
390
+ transcription.model_name=gemini-2.5-pro \
391
+ transcription.gemini_api_key=YOUR_KEY \
392
+ caption.split_sentence=true
393
+
394
+ # Full configuration
395
+ lai transcribe align \
396
+ input_media=audio.wav \
397
+ output_caption=output.srt \
398
+ transcription.device=mps \
399
+ transcription.model_name=iic/SenseVoiceSmall \
400
+ alignment.device=cuda \
401
+ caption.word_level=true
402
+ ```
403
+
404
+ **Parameters:**
405
+ - `input_media`: Path to input audio/video file (required)
406
+ - `output_caption`: Path for output aligned caption file (required)
407
+ - `transcription`: Transcription configuration (model_name, device, language, gemini_api_key)
408
+ - `alignment`: Alignment configuration (model_name, device)
409
+ - `caption`: Caption formatting options (split_sentence, word_level, etc.)
410
+
411
+
412
+ ### lai caption convert
413
+
414
+ ```bash
415
+ lai caption convert input.srt output.vtt
416
+ lai caption convert input.srt output.json normalize_text=true
417
+ ```
418
+
419
+ ### lai caption shift
420
+
421
+ ```bash
422
+ lai caption shift input.srt output.srt 2.0 # Delay by 2 seconds
423
+ lai caption shift input.srt output.srt -1.5 # Advance by 1.5 seconds
424
+ ```
425
+
426
+ ---
427
+
428
+ ## Python SDK Reference
429
+
430
+ ### Basic Alignment
431
+
432
+ ```python
433
+ from lattifai import LattifAI
434
+
435
+ # Initialize client (uses LATTIFAI_API_KEY from environment)
436
+ client = LattifAI()
437
+
438
+ # Align audio/video with subtitle
439
+ caption = client.alignment(
440
+ input_media="audio.wav", # Audio or video file
441
+ input_caption="subtitle.srt", # Input subtitle file
442
+ output_caption_path="output.srt", # Output aligned subtitle
443
+ split_sentence=True, # Enable smart sentence splitting
444
+ )
445
+
446
+ # Access alignment results
447
+ for segment in caption.supervisions:
448
+ print(f"{segment.start:.2f}s - {segment.end:.2f}s: {segment.text}")
449
+ ```
450
+
451
+ ### YouTube Processing
452
+
453
+ ```python
454
+ from lattifai import LattifAI
455
+
456
+ client = LattifAI()
457
+
458
+ # Download YouTube video and align with auto-downloaded subtitles
459
+ caption = client.youtube(
460
+ url="https://youtube.com/watch?v=VIDEO_ID",
461
+ output_dir="./downloads",
462
+ output_caption_path="aligned.srt",
463
+ split_sentence=True,
464
+ )
465
+ ```
466
+
467
+
468
+ ### Configuration Objects
469
+
470
+ LattifAI uses a config-driven architecture for fine-grained control:
471
+
472
+ #### ClientConfig - API Settings
473
+
474
+ ```python
475
+ from lattifai import LattifAI, ClientConfig
476
+
477
+ client = LattifAI(
478
+ client_config=ClientConfig(
479
+ api_key="lf_your_api_key", # Or use LATTIFAI_API_KEY env var
480
+ timeout=30.0,
481
+ max_retries=3,
482
+ )
483
+ )
484
+ ```
485
+
486
+ #### AlignmentConfig - Model Settings
487
+
488
+ ```python
489
+ from lattifai import LattifAI, AlignmentConfig
490
+
491
+ client = LattifAI(
492
+ alignment_config=AlignmentConfig(
493
+ model_name="Lattifai/Lattice-1",
494
+ device="cuda", # "cpu", "cuda", "cuda:0", "mps"
495
+ )
496
+ )
497
+ ```
498
+
499
+ #### CaptionConfig - Subtitle Settings
500
+
501
+ ```python
502
+ from lattifai import LattifAI, CaptionConfig
503
+
504
+ client = LattifAI(
505
+ caption_config=CaptionConfig(
506
+ split_sentence=True, # Smart sentence splitting
507
+ word_level=True, # Word-level timestamps
508
+ normalize_text=True, # Clean HTML entities
509
+ include_speaker_in_text=False, # Include speaker labels
510
+ )
511
+ )
512
+ ```
513
+
514
+ #### Complete Configuration Example
515
+
516
+ ```python
517
+ from lattifai import (
518
+ LattifAI,
519
+ ClientConfig,
520
+ AlignmentConfig,
521
+ CaptionConfig
522
+ )
523
+
524
+ client = LattifAI(
525
+ client_config=ClientConfig(
526
+ api_key="lf_your_api_key",
527
+ timeout=60.0,
528
+ ),
529
+ alignment_config=AlignmentConfig(
530
+ model_name="Lattifai/Lattice-1",
531
+ device="cuda",
532
+ ),
533
+ caption_config=CaptionConfig(
534
+ split_sentence=True,
535
+ word_level=True,
536
+ output_format="json",
537
+ ),
538
+ )
539
+
540
+ caption = client.alignment(
541
+ input_media="audio.wav",
542
+ input_caption="subtitle.srt",
543
+ output_caption_path="output.json",
544
+ )
545
+ ```
546
+
547
+ ### Available Exports
548
+
549
+ ```python
550
+ from lattifai import (
551
+ # Client classes
552
+ LattifAI,
553
+ # AsyncLattifAI, # For async support
554
+
555
+ # Config classes
556
+ ClientConfig,
557
+ AlignmentConfig,
558
+ CaptionConfig,
559
+ DiarizationConfig,
560
+ MediaConfig,
561
+
562
+ # I/O classes
563
+ Caption,
564
+ )
565
+ ```
566
+
567
+ ---
568
+
569
+ ## Advanced Features
570
+
571
+ ### Word-Level Alignment
572
+
573
+ Enable `word_level=True` to get precise timestamps for each word:
574
+
575
+ ```python
576
+ from lattifai import LattifAI, CaptionConfig
577
+
578
+ client = LattifAI(
579
+ caption_config=CaptionConfig(word_level=True)
580
+ )
581
+
582
+ caption = client.alignment(
583
+ input_media="audio.wav",
584
+ input_caption="subtitle.srt",
585
+ output_caption_path="output.json", # JSON preserves word-level data
586
+ )
587
+
588
+ # Access word-level alignments
589
+ for segment in caption.alignments:
590
+ if segment.alignment and "word" in segment.alignment:
591
+ for word_item in segment.alignment["word"]:
592
+ print(f"{word_item.start:.2f}s: {word_item.symbol} (confidence: {word_item.score:.2f})")
593
+ ```
594
+
595
+ ### Smart Sentence Splitting
596
+
597
+ The `split_sentence` option intelligently separates:
598
+ - Non-speech elements (`[APPLAUSE]`, `[MUSIC]`) from dialogue
599
+ - Multiple sentences within a single subtitle
600
+ - Speaker labels from content
601
+
602
+ ```python
603
+ caption = client.alignment(
604
+ input_media="audio.wav",
605
+ input_caption="subtitle.srt",
606
+ split_sentence=True,
607
+ )
608
+ ```
609
+
610
+ ### Speaker Diarization (WIP)
611
+
612
+ **Note:** This feature is currently under development and not yet fully available.
613
+
614
+ Speaker diarization automatically identifies and labels different speakers in audio. When enabled, the system will:
615
+ - Detect speaker changes in the audio
616
+ - Assign speaker labels (e.g., SPEAKER_00, SPEAKER_01) to each segment
617
+ - Update subtitle segments with speaker information
618
+
619
+ **Speaker Name Handling:**
620
+ - **Existing speaker labels in subtitles**: If your input captions already contain speaker names (e.g., `[Alice]`, `>> Bob:`, or `SPEAKER_01:`), the system will preserve them as much as possible during alignment
621
+ - **Gemini Transcriber**: When using Gemini models for transcription (e.g., `gemini-2.5-pro`), the model can intelligently identify and extract speaker names from dialogue context, making it easier to generate speaker-aware transcripts
622
+
623
+ **Python SDK:**
624
+ ```python
625
+ from lattifai import LattifAI, DiarizationConfig
626
+
627
+ client = LattifAI(
628
+ diarization_config=DiarizationConfig(enabled=True)
629
+ )
630
+
631
+ caption = client.alignment(
632
+ input_media="audio.wav",
633
+ input_caption="subtitle.srt",
634
+ output_caption_path="output.srt",
635
+ )
636
+
637
+ # Access speaker information
638
+ for segment in caption.supervisions:
639
+ print(f"[{segment.speaker}] {segment.text}")
640
+ ```
641
+
642
+ ### YAML Configuration Files
643
+
644
+ Create reusable configuration files:
645
+
646
+ ```yaml
647
+ # config/alignment.yaml
648
+ model_name: "Lattifai/Lattice-1"
649
+ device: "cuda"
650
+ batch_size: 1
651
+ ```
652
+
653
+ ```bash
654
+ lai alignment align audio.wav subtitle.srt output.srt \
655
+ alignment=config/alignment.yaml
656
+ ```
657
+
658
+ ---
659
+
660
+ ## Supported Formats
661
+
662
+ LattifAI supports virtually all common media and subtitle formats:
663
+
664
+ | Type | Formats |
665
+ |------|---------|
666
+ | **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
667
+ | **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
668
+ | **Caption/Subtitle Input** | SRT, VTT, ASS, SSA, SUB, SBV, TXT, Gemini, and more |
669
+ | **Caption/Subtitle Output** | All input formats + TextGrid (Praat) |
670
+
671
+ **Tabular Formats:**
672
+ - **TSV**: Tab-separated values with optional speaker column
673
+ - **CSV**: Comma-separated values with optional speaker column
674
+ - **AUD**: Audacity labels format with `[[speaker]]` notation
675
+
676
+ > **Note**: If a format is not listed above but commonly used, it's likely supported. Feel free to try it or reach out if you encounter any issues.
677
+
678
+ ---
679
+
680
+ ## Roadmap
681
+
682
+ Visit our [LattifAI roadmap](https://lattifai.com/roadmap) for the latest updates.
683
+
684
+ | Date | Release | Features |
685
+ |------|---------|----------|
686
+ | **Oct 2025** | **Lattice-1-Alpha** | ✅ English forced alignment<br>✅ Multi-format support<br>✅ CPU/GPU optimization |
687
+ | **Nov 2025** | **Lattice-1** | ✅ English + Chinese + German<br>✅ Mixed languages alignment<br>🚀 Integrate Speaker Diarization |
688
+
689
+ ---
690
+
691
+ ## Development
692
+
693
+ ### Setup
694
+
695
+ ```bash
696
+ git clone https://github.com/lattifai/lattifai-python.git
697
+ cd lattifai-python
698
+
699
+ # Using uv (recommended)
700
+ curl -LsSf https://astral.sh/uv/install.sh | sh
701
+ uv sync
702
+ source .venv/bin/activate
703
+
704
+ # Or using pip
705
+ pip install -e ".[test]"
706
+
707
+ pre-commit install
708
+ ```
709
+
710
+ ### Testing
711
+
712
+ ```bash
713
+ pytest # Run all tests
714
+ pytest --cov=src # With coverage
715
+ pytest tests/test_basic.py # Specific test
716
+ ```
717
+
718
+ ---
719
+
720
+ ## Contributing
721
+
722
+ 1. Fork the repository
723
+ 2. Create a feature branch
724
+ 3. Make changes and add tests
725
+ 4. Run `pytest` and `pre-commit run`
726
+ 5. Submit a pull request
727
+
728
+ ## License
729
+
730
+ MIT License
731
+
732
+ ## Support
733
+
734
+ - **Issues**: [GitHub Issues](https://github.com/lattifai/lattifai-python/issues)
735
+ - **Discussions**: [GitHub Discussions](https://github.com/lattifai/lattifai-python/discussions)
736
+ - **Discord**: [Join our community](https://discord.gg/kvF4WsBRK8)