lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
- lattifai-1.3.0.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/youtube/client.py
CHANGED
|
@@ -15,7 +15,8 @@ try:
|
|
|
15
15
|
except ImportError:
|
|
16
16
|
yt_dlp = None
|
|
17
17
|
|
|
18
|
-
from
|
|
18
|
+
from lattifai.caption.config import CAPTION_FORMATS
|
|
19
|
+
|
|
19
20
|
from ..errors import LattifAIError
|
|
20
21
|
from ..workflow.base import setup_workflow_logger
|
|
21
22
|
from ..workflow.file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
|
|
@@ -49,6 +50,12 @@ class YoutubeLoader:
|
|
|
49
50
|
if yt_dlp is None:
|
|
50
51
|
raise ImportError("yt-dlp is required. Install with `pip install yt-dlp`")
|
|
51
52
|
|
|
53
|
+
# Auto-load from environment if not specified
|
|
54
|
+
if proxy is None:
|
|
55
|
+
proxy = os.getenv("YOUTUBE_PROXY")
|
|
56
|
+
if cookies is None:
|
|
57
|
+
cookies = os.getenv("YOUTUBE_COOKIE_FILE") or os.getenv("YOUTUBE_COOKIE_BROWSER")
|
|
58
|
+
|
|
52
59
|
self.proxy = proxy
|
|
53
60
|
self.cookies = cookies
|
|
54
61
|
|
|
@@ -64,14 +71,29 @@ class YoutubeLoader:
|
|
|
64
71
|
|
|
65
72
|
if self.proxy:
|
|
66
73
|
self._base_opts["proxy"] = self.proxy
|
|
74
|
+
logger.info(f"🌐 Using proxy: {self.proxy}")
|
|
67
75
|
|
|
76
|
+
# Cookie configuration
|
|
68
77
|
if self.cookies:
|
|
69
|
-
|
|
78
|
+
# Check if it's a browser name (chrome, firefox, safari, etc.)
|
|
79
|
+
browser_names = ["chrome", "firefox", "safari", "edge", "opera", "brave"]
|
|
80
|
+
if self.cookies.lower() in browser_names:
|
|
81
|
+
# Use cookies from browser directly
|
|
82
|
+
self._base_opts["cookiesfrombrowser"] = (self.cookies.lower(),)
|
|
83
|
+
logger.info(f"🍪 Using cookies from browser: {self.cookies}")
|
|
84
|
+
else:
|
|
85
|
+
# Use cookie file
|
|
86
|
+
cookie_path = Path(self.cookies).expanduser()
|
|
87
|
+
if cookie_path.exists():
|
|
88
|
+
self._base_opts["cookiefile"] = str(cookie_path)
|
|
89
|
+
logger.info(f"🍪 Using cookie file: {cookie_path}")
|
|
90
|
+
else:
|
|
91
|
+
logger.warning(f"⚠️ Cookie file not found: {cookie_path}")
|
|
92
|
+
logger.warning("💡 Tip: Run 'yt-dlp --cookies-from-browser chrome' to extract cookies")
|
|
70
93
|
|
|
71
|
-
#
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
self._base_opts["extractor_args"] = {"youtube": {"player_client": ["android", "web"]}}
|
|
94
|
+
# Note: player_client configuration is removed to avoid format availability issues
|
|
95
|
+
# with certain videos. Let yt-dlp automatically select the best client.
|
|
96
|
+
# Previous config caused "Requested format is not available" errors for some videos.
|
|
75
97
|
|
|
76
98
|
def get_video_info(self, video_id: str) -> Dict[str, Any]:
|
|
77
99
|
"""
|
|
@@ -136,8 +158,23 @@ class YoutubeLoader:
|
|
|
136
158
|
|
|
137
159
|
except yt_dlp.utils.DownloadError as e:
|
|
138
160
|
msg = str(e)
|
|
139
|
-
if "Sign in to confirm" in msg or "
|
|
140
|
-
|
|
161
|
+
if "Sign in to confirm" in msg or "not a bot" in msg:
|
|
162
|
+
# Bot detection error - provide helpful guidance
|
|
163
|
+
error_msg = (
|
|
164
|
+
f"🤖 YouTube Bot Detection: Video {video_id} requires authentication.\n\n"
|
|
165
|
+
"Solutions:\n"
|
|
166
|
+
"1. Use browser cookies (recommended):\n"
|
|
167
|
+
" loader = YoutubeLoader(cookies='chrome') # or 'firefox', 'safari'\n\n"
|
|
168
|
+
"2. Export cookie file:\n"
|
|
169
|
+
" yt-dlp --cookies-from-browser chrome --cookies cookies.txt <video_url>\n"
|
|
170
|
+
" loader = YoutubeLoader(cookies='cookies.txt')\n\n"
|
|
171
|
+
"3. Environment variable:\n"
|
|
172
|
+
" export YOUTUBE_COOKIE_BROWSER=chrome\n\n"
|
|
173
|
+
f"Original error: {msg}"
|
|
174
|
+
)
|
|
175
|
+
raise VideoUnavailableError(error_msg) from e
|
|
176
|
+
elif "Private video" in msg:
|
|
177
|
+
raise VideoUnavailableError(f"Video {video_id} is private") from e
|
|
141
178
|
raise YouTubeError(f"yt-dlp failed: {msg}") from e
|
|
142
179
|
except Exception as e:
|
|
143
180
|
raise YouTubeError(f"Unexpected error: {str(e)}") from e
|
|
@@ -192,8 +229,8 @@ class YoutubeLoader:
|
|
|
192
229
|
return "Unknown"
|
|
193
230
|
|
|
194
231
|
def _find_best_format(self, formats: List[Dict]) -> Optional[Dict]:
|
|
195
|
-
# Prefer json3, then vtt
|
|
196
|
-
priority = ["json3", "
|
|
232
|
+
# Prefer json3 (best precision), srv3 (word-level timing), then vtt
|
|
233
|
+
priority = ["json3", "srv3", "vtt", "ttml", "srv2", "srv1"]
|
|
197
234
|
|
|
198
235
|
for fmt_ext in priority:
|
|
199
236
|
for f in formats:
|
|
@@ -234,18 +271,11 @@ class YoutubeLoader:
|
|
|
234
271
|
"""
|
|
235
272
|
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
236
273
|
|
|
237
|
-
# Use
|
|
274
|
+
# Use base opts (includes proxy and cookie config) + DASH manifest
|
|
238
275
|
opts = {
|
|
239
|
-
|
|
240
|
-
"no_warnings": True,
|
|
241
|
-
"skip_download": True,
|
|
242
|
-
"extract_flat": False,
|
|
276
|
+
**self._base_opts,
|
|
243
277
|
"youtube_include_dash_manifest": True,
|
|
244
278
|
}
|
|
245
|
-
if self.proxy:
|
|
246
|
-
opts["proxy"] = self.proxy
|
|
247
|
-
if self.cookies:
|
|
248
|
-
opts["cookiefile"] = self.cookies
|
|
249
279
|
|
|
250
280
|
try:
|
|
251
281
|
with yt_dlp.YoutubeDL(opts) as ydl:
|
|
@@ -253,18 +283,57 @@ class YoutubeLoader:
|
|
|
253
283
|
|
|
254
284
|
# Get all formats and filter for audio-only (no video track)
|
|
255
285
|
formats = info.get("formats", [])
|
|
286
|
+
|
|
287
|
+
def is_direct_url(url: str) -> bool:
|
|
288
|
+
"""Check if URL is a direct stream URL (not HLS manifest)"""
|
|
289
|
+
if not url:
|
|
290
|
+
return False
|
|
291
|
+
# HLS manifests contain these patterns
|
|
292
|
+
hls_patterns = ["manifest.googlevideo.com", "/hls_playlist/", ".m3u8"]
|
|
293
|
+
return not any(p in url for p in hls_patterns)
|
|
294
|
+
|
|
256
295
|
audio_formats = [
|
|
257
296
|
f
|
|
258
297
|
for f in formats
|
|
259
298
|
if f.get("acodec") not in (None, "none")
|
|
260
299
|
and f.get("vcodec") in (None, "none")
|
|
261
300
|
and f.get("url") # Must have a direct URL
|
|
301
|
+
and is_direct_url(f.get("url")) # Exclude HLS manifests
|
|
262
302
|
]
|
|
263
303
|
|
|
264
304
|
if not audio_formats:
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
)
|
|
305
|
+
# Fallback: If no audio-only formats, use lowest resolution video with audio
|
|
306
|
+
# This happens with HLS-only videos (e.g., protected content)
|
|
307
|
+
logger.warning("No audio-only formats found. Falling back to lowest resolution video with audio.")
|
|
308
|
+
audio_formats = [
|
|
309
|
+
f
|
|
310
|
+
for f in formats
|
|
311
|
+
if f.get("acodec") not in (None, "none")
|
|
312
|
+
and f.get("vcodec") not in (None, "none")
|
|
313
|
+
and f.get("url")
|
|
314
|
+
and is_direct_url(f.get("url")) # Exclude HLS manifests
|
|
315
|
+
]
|
|
316
|
+
# Sort by resolution (lowest first) for minimal bandwidth
|
|
317
|
+
audio_formats.sort(key=lambda f: f.get("height") or f.get("width") or 9999)
|
|
318
|
+
|
|
319
|
+
if not audio_formats:
|
|
320
|
+
# Check if there are HLS-only formats (common for Shorts)
|
|
321
|
+
# HLS can still work with server-side streaming (same IP)
|
|
322
|
+
hls_with_audio = [f for f in formats if f.get("acodec") not in (None, "none") and f.get("url")]
|
|
323
|
+
if hls_with_audio:
|
|
324
|
+
logger.warning("Only HLS streams available. Returning HLS URL for server-side streaming.")
|
|
325
|
+
# Sort: prefer audio-only, then by resolution (lowest first)
|
|
326
|
+
hls_with_audio.sort(
|
|
327
|
+
key=lambda f: (
|
|
328
|
+
0 if f.get("vcodec") in (None, "none") else 1,
|
|
329
|
+
f.get("height") or f.get("width") or 9999,
|
|
330
|
+
)
|
|
331
|
+
)
|
|
332
|
+
audio_formats = hls_with_audio
|
|
333
|
+
else:
|
|
334
|
+
raise YouTubeError(
|
|
335
|
+
"No formats with audio available. YouTube may require authentication for this video."
|
|
336
|
+
)
|
|
268
337
|
|
|
269
338
|
# Filter by audio_track_id if specified (for multi-language audio)
|
|
270
339
|
if audio_track_id:
|
|
@@ -314,17 +383,30 @@ class YoutubeLoader:
|
|
|
314
383
|
audio_formats.sort(key=score_format, reverse=True)
|
|
315
384
|
best = audio_formats[0]
|
|
316
385
|
|
|
386
|
+
# Check if selected format is HLS (requires server-side streaming)
|
|
387
|
+
best_url = best.get("url", "")
|
|
388
|
+
is_hls = not is_direct_url(best_url)
|
|
389
|
+
|
|
317
390
|
return {
|
|
318
|
-
"url":
|
|
391
|
+
"url": best_url,
|
|
319
392
|
"mime_type": best.get("ext", format_preference),
|
|
320
393
|
"bitrate": best.get("abr") or best.get("tbr"),
|
|
394
|
+
"sample_rate": best.get("asr"), # Audio sample rate
|
|
321
395
|
"content_length": best.get("filesize") or best.get("filesize_approx"),
|
|
322
396
|
"format_id": best.get("format_id"),
|
|
323
397
|
"ext": best.get("ext"),
|
|
398
|
+
"is_hls": is_hls, # True = use server streaming, False = use proxy
|
|
324
399
|
}
|
|
325
400
|
|
|
326
401
|
except yt_dlp.utils.DownloadError as e:
|
|
327
|
-
|
|
402
|
+
msg = str(e)
|
|
403
|
+
if "Sign in to confirm" in msg or "not a bot" in msg:
|
|
404
|
+
raise YouTubeError(
|
|
405
|
+
f"🤖 YouTube Bot Detection: Cookie configuration required to access this video. "
|
|
406
|
+
f"Reference: YoutubeLoader(cookies='chrome') or set environment variable YOUTUBE_COOKIE_BROWSER=chrome. "
|
|
407
|
+
f"Original error: {msg}"
|
|
408
|
+
) from e
|
|
409
|
+
raise YouTubeError(f"Failed to get audio URL: {msg}") from e
|
|
328
410
|
except Exception as e:
|
|
329
411
|
raise YouTubeError(f"Unexpected error getting audio URL: {str(e)}") from e
|
|
330
412
|
|
|
@@ -346,18 +428,12 @@ class YoutubeLoader:
|
|
|
346
428
|
"""
|
|
347
429
|
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
348
430
|
|
|
349
|
-
# Use
|
|
431
|
+
# Use base opts (includes proxy and cookie config) + DASH and HLS manifests
|
|
350
432
|
opts = {
|
|
351
|
-
|
|
352
|
-
"no_warnings": True,
|
|
353
|
-
"skip_download": True,
|
|
354
|
-
"extract_flat": False,
|
|
433
|
+
**self._base_opts,
|
|
355
434
|
"youtube_include_dash_manifest": True,
|
|
435
|
+
"youtube_include_hls_manifest": True,
|
|
356
436
|
}
|
|
357
|
-
if self.proxy:
|
|
358
|
-
opts["proxy"] = self.proxy
|
|
359
|
-
if self.cookies:
|
|
360
|
-
opts["cookiefile"] = self.cookies
|
|
361
437
|
|
|
362
438
|
try:
|
|
363
439
|
with yt_dlp.YoutubeDL(opts) as ydl:
|
|
@@ -366,27 +442,34 @@ class YoutubeLoader:
|
|
|
366
442
|
# Get all formats
|
|
367
443
|
formats = info.get("formats", [])
|
|
368
444
|
|
|
445
|
+
def is_direct_url(url: str) -> bool:
|
|
446
|
+
"""Check if URL is a direct stream URL (not HLS manifest)"""
|
|
447
|
+
if not url:
|
|
448
|
+
return False
|
|
449
|
+
hls_patterns = ["manifest.googlevideo.com", "/hls_playlist/", ".m3u8"]
|
|
450
|
+
return not any(p in url for p in hls_patterns)
|
|
451
|
+
|
|
369
452
|
# Filter for video formats:
|
|
370
453
|
# - Must have video codec
|
|
371
|
-
# - Must have
|
|
372
|
-
# -
|
|
373
|
-
def
|
|
454
|
+
# - Must have a URL
|
|
455
|
+
# - Prefer direct URLs (DASH) over HLS manifests
|
|
456
|
+
def is_usable_video(f: Dict) -> bool:
|
|
374
457
|
if f.get("vcodec") in (None, "none"):
|
|
375
458
|
return False
|
|
376
|
-
|
|
377
|
-
protocol = f.get("protocol", "")
|
|
378
|
-
# Exclude HLS manifests
|
|
379
|
-
if "m3u8" in protocol or ".m3u8" in url or "manifest.googlevideo.com" in url:
|
|
380
|
-
return False
|
|
381
|
-
# Exclude DASH manifests
|
|
382
|
-
if "dash" in protocol:
|
|
459
|
+
if not f.get("url"):
|
|
383
460
|
return False
|
|
384
461
|
return True
|
|
385
462
|
|
|
386
|
-
|
|
463
|
+
# First try: direct URLs only (exclude HLS)
|
|
464
|
+
video_formats = [f for f in formats if is_usable_video(f) and is_direct_url(f.get("url", ""))]
|
|
387
465
|
|
|
466
|
+
# Fallback: include HLS if no direct formats
|
|
388
467
|
if not video_formats:
|
|
389
|
-
|
|
468
|
+
logger.warning("No direct video URLs found. Falling back to HLS formats.")
|
|
469
|
+
video_formats = [f for f in formats if is_usable_video(f)]
|
|
470
|
+
|
|
471
|
+
if not video_formats:
|
|
472
|
+
raise YouTubeError("No video formats available")
|
|
390
473
|
|
|
391
474
|
# Parse target height from quality parameter
|
|
392
475
|
target_height = None
|
|
@@ -413,15 +496,19 @@ class YoutubeLoader:
|
|
|
413
496
|
video_formats.sort(key=score_format, reverse=True)
|
|
414
497
|
best = video_formats[0]
|
|
415
498
|
|
|
499
|
+
# Check if selected format is HLS
|
|
500
|
+
best_url = best.get("url", "")
|
|
501
|
+
is_hls = not is_direct_url(best_url)
|
|
502
|
+
|
|
416
503
|
# Log selection for debugging
|
|
417
504
|
logger.info(
|
|
418
505
|
f"Selected video format: {best.get('format_id')} "
|
|
419
506
|
f"({best.get('width')}x{best.get('height')}, "
|
|
420
|
-
f"vcodec={best.get('vcodec')}, acodec={best.get('acodec')})"
|
|
507
|
+
f"vcodec={best.get('vcodec')}, acodec={best.get('acodec')}, is_hls={is_hls})"
|
|
421
508
|
)
|
|
422
509
|
|
|
423
510
|
return {
|
|
424
|
-
"url":
|
|
511
|
+
"url": best_url,
|
|
425
512
|
"mime_type": best.get("ext", format_preference),
|
|
426
513
|
"width": best.get("width"),
|
|
427
514
|
"height": best.get("height"),
|
|
@@ -432,10 +519,18 @@ class YoutubeLoader:
|
|
|
432
519
|
"content_length": best.get("filesize") or best.get("filesize_approx"),
|
|
433
520
|
"format_id": best.get("format_id"),
|
|
434
521
|
"ext": best.get("ext"),
|
|
522
|
+
"is_hls": is_hls,
|
|
435
523
|
}
|
|
436
524
|
|
|
437
525
|
except yt_dlp.utils.DownloadError as e:
|
|
438
|
-
|
|
526
|
+
msg = str(e)
|
|
527
|
+
if "Sign in to confirm" in msg or "not a bot" in msg:
|
|
528
|
+
raise YouTubeError(
|
|
529
|
+
f"🤖 YouTube Bot Detection: Cookie configuration required to access this video. "
|
|
530
|
+
f"Reference: YoutubeLoader(cookies='chrome') or set environment variable YOUTUBE_COOKIE_BROWSER=chrome. "
|
|
531
|
+
f"Original error: {msg}"
|
|
532
|
+
) from e
|
|
533
|
+
raise YouTubeError(f"Failed to get video URL: {msg}") from e
|
|
439
534
|
except Exception as e:
|
|
440
535
|
raise YouTubeError(f"Unexpected error getting video URL: {str(e)}") from e
|
|
441
536
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lattifai
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
|
|
5
5
|
Author-email: Lattifai Technologies <tech@lattifai.com>
|
|
6
6
|
Maintainer-email: Lattice <tech@lattifai.com>
|
|
@@ -50,49 +50,40 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
50
50
|
Requires-Python: <3.15,>=3.10
|
|
51
51
|
Description-Content-Type: text/markdown
|
|
52
52
|
License-File: LICENSE
|
|
53
|
-
Requires-Dist:
|
|
54
|
-
Requires-Dist:
|
|
55
|
-
Requires-Dist: lattifai
|
|
56
|
-
Requires-Dist: lattifai[
|
|
57
|
-
Requires-Dist: lattifai
|
|
58
|
-
|
|
59
|
-
Requires-Dist:
|
|
60
|
-
Requires-Dist:
|
|
61
|
-
Requires-Dist:
|
|
62
|
-
Requires-Dist:
|
|
63
|
-
Requires-Dist:
|
|
64
|
-
Requires-Dist:
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
Requires-Dist:
|
|
69
|
-
Requires-Dist:
|
|
70
|
-
Requires-Dist: praatio; extra == "alignment"
|
|
71
|
-
Requires-Dist: tgt; extra == "alignment"
|
|
72
|
-
Requires-Dist: onnx>=1.16.0; extra == "alignment"
|
|
73
|
-
Requires-Dist: onnxruntime; extra == "alignment"
|
|
74
|
-
Requires-Dist: g2p-phonemizer>=0.4.0; extra == "alignment"
|
|
75
|
-
Requires-Dist: wtpsplit>=2.1.7; extra == "alignment"
|
|
76
|
-
Requires-Dist: modelscope>=1.33.0; extra == "alignment"
|
|
77
|
-
Requires-Dist: error-align-fix>=0.1.4; extra == "alignment"
|
|
53
|
+
Requires-Dist: python-dotenv
|
|
54
|
+
Requires-Dist: colorful>=0.5.6
|
|
55
|
+
Requires-Dist: lattifai-run>=1.0.1
|
|
56
|
+
Requires-Dist: lattifai-captions[splitting]>=0.1.6
|
|
57
|
+
Requires-Dist: lattifai-core-hq>=0.6.4
|
|
58
|
+
Requires-Dist: g2p-phonemizer>=0.4.0
|
|
59
|
+
Requires-Dist: error-align-fix>=0.1.4
|
|
60
|
+
Requires-Dist: lhotse>=1.26.0
|
|
61
|
+
Requires-Dist: k2py==0.2.4
|
|
62
|
+
Requires-Dist: onnxruntime
|
|
63
|
+
Requires-Dist: av
|
|
64
|
+
Requires-Dist: msgpack
|
|
65
|
+
Provides-Extra: event
|
|
66
|
+
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "event"
|
|
67
|
+
Provides-Extra: diarization
|
|
68
|
+
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "diarization"
|
|
69
|
+
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "diarization"
|
|
78
70
|
Provides-Extra: transcription
|
|
79
71
|
Requires-Dist: OmniSenseVoice>=0.4.2; extra == "transcription"
|
|
80
72
|
Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "transcription"
|
|
81
73
|
Requires-Dist: google-genai>=1.22.0; extra == "transcription"
|
|
82
74
|
Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "transcription"
|
|
83
|
-
Provides-Extra:
|
|
84
|
-
Requires-Dist: questionary>=2.0; extra == "
|
|
85
|
-
Requires-Dist: yt-dlp; extra == "
|
|
86
|
-
Requires-Dist: pycryptodome; extra == "
|
|
87
|
-
Provides-Extra: server
|
|
88
|
-
Requires-Dist: fastapi>=0.111.0; extra == "server"
|
|
89
|
-
Requires-Dist: uvicorn>=0.30.0; extra == "server"
|
|
90
|
-
Requires-Dist: python-multipart>=0.0.9; extra == "server"
|
|
91
|
-
Requires-Dist: jinja2>=3.1.4; extra == "server"
|
|
75
|
+
Provides-Extra: youtube
|
|
76
|
+
Requires-Dist: questionary>=2.0; extra == "youtube"
|
|
77
|
+
Requires-Dist: yt-dlp; extra == "youtube"
|
|
78
|
+
Requires-Dist: pycryptodome; extra == "youtube"
|
|
92
79
|
Provides-Extra: dev
|
|
80
|
+
Requires-Dist: black; extra == "dev"
|
|
93
81
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
94
82
|
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
95
83
|
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
84
|
+
Provides-Extra: all
|
|
85
|
+
Requires-Dist: lattifai[transcription]; extra == "all"
|
|
86
|
+
Requires-Dist: lattifai[youtube]; extra == "all"
|
|
96
87
|
Dynamic: license-file
|
|
97
88
|
|
|
98
89
|
<div align="center">
|
|
@@ -104,13 +95,13 @@ Dynamic: license-file
|
|
|
104
95
|
</div>
|
|
105
96
|
|
|
106
97
|
<p align="center">
|
|
107
|
-
🌐 <a href="https://lattifai.com"><b>Official Website</b></a>  
|
|
98
|
+
🌐 <a href="https://lattifai.com"><b>Official Website</b></a> | 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> | 🤗 <a href="https://huggingface.co/LattifAI/Lattice-1">Model</a> | 📑 <a href="https://lattifai.com/blogs">Blog</a> | <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
|
|
108
99
|
</p>
|
|
109
100
|
|
|
110
101
|
|
|
111
102
|
# LattifAI: Precision Alignment, Infinite Possibilities
|
|
112
103
|
|
|
113
|
-
Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/
|
|
104
|
+
Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/LattifAI/Lattice-1) model.
|
|
114
105
|
|
|
115
106
|
## Table of Contents
|
|
116
107
|
|
|
@@ -120,6 +111,7 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
|
|
|
120
111
|
- [CLI Reference](#cli-reference)
|
|
121
112
|
- [Python SDK](#python-sdk)
|
|
122
113
|
- [Advanced Features](#advanced-features)
|
|
114
|
+
- [Text Processing](#text-processing)
|
|
123
115
|
- [Supported Formats & Languages](#supported-formats--languages)
|
|
124
116
|
- [Roadmap](#roadmap)
|
|
125
117
|
- [Development](#development)
|
|
@@ -130,7 +122,7 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
|
|
|
130
122
|
|
|
131
123
|
| Feature | Description |
|
|
132
124
|
|---------|-------------|
|
|
133
|
-
| **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/
|
|
125
|
+
| **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/LattifAI/Lattice-1) |
|
|
134
126
|
| **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) |
|
|
135
127
|
| **Speaker Diarization** | Multi-speaker identification with label preservation |
|
|
136
128
|
| **Streaming Mode** | Process audio up to 20 hours with minimal memory |
|
|
@@ -138,10 +130,10 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
|
|
|
138
130
|
|
|
139
131
|
### Alignment Models
|
|
140
132
|
|
|
141
|
-
| Model | Languages | Description |
|
|
142
|
-
|
|
143
|
-
| **Lattice-1** | English, Chinese, German | Production model with mixed-language alignment support |
|
|
144
|
-
| **Lattice-1-Alpha** | English | Initial release with English forced alignment |
|
|
133
|
+
| Model | Links | Languages | Description |
|
|
134
|
+
|-------|-------|-----------|-------------|
|
|
135
|
+
| **Lattice-1** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1) | English, Chinese, German | Production model with mixed-language alignment support |
|
|
136
|
+
| **Lattice-1-Alpha** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1-Alpha) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1-Alpha) | English | Initial release with English forced alignment |
|
|
145
137
|
|
|
146
138
|
**Model Hub**: Models can be downloaded from `huggingface` (default) or `modelscope` (recommended for users in China):
|
|
147
139
|
|
|
@@ -151,7 +143,8 @@ lai alignment align audio.wav caption.srt output.srt alignment.model_hub=modelsc
|
|
|
151
143
|
```
|
|
152
144
|
|
|
153
145
|
```python
|
|
154
|
-
from lattifai import LattifAI
|
|
146
|
+
from lattifai.client import LattifAI
|
|
147
|
+
from lattifai.config import AlignmentConfig
|
|
155
148
|
|
|
156
149
|
client = LattifAI(alignment_config=AlignmentConfig(model_hub="modelscope"))
|
|
157
150
|
```
|
|
@@ -173,16 +166,34 @@ uvx --from lattifai lai --help
|
|
|
173
166
|
|
|
174
167
|
# Or create a project
|
|
175
168
|
mkdir my-project && cd my-project
|
|
176
|
-
uv init --bare && uv add lattifai
|
|
169
|
+
uv init --bare && uv add "lattifai[all]"
|
|
177
170
|
uv run lai alignment align audio.wav caption.srt output.srt
|
|
178
171
|
```
|
|
179
172
|
|
|
180
173
|
### Using pip
|
|
181
174
|
|
|
182
175
|
```bash
|
|
183
|
-
|
|
176
|
+
# Full installation (recommended)
|
|
177
|
+
pip install "lattifai[all]"
|
|
184
178
|
```
|
|
185
179
|
|
|
180
|
+
### Installation Options
|
|
181
|
+
|
|
182
|
+
| Extra | Command | Includes |
|
|
183
|
+
|-------|---------|----------|
|
|
184
|
+
| (base) | `pip install lattifai` | Forced alignment (Lattice-1, k2py, ONNX, captions) |
|
|
185
|
+
| `all` | `pip install "lattifai[all]"` | Base + transcription + youtube |
|
|
186
|
+
| `transcription` | `pip install "lattifai[transcription]"` | ASR models (Gemini, Parakeet, SenseVoice) |
|
|
187
|
+
| `youtube` | `pip install "lattifai[youtube]"` | YouTube download (yt-dlp) |
|
|
188
|
+
| `diarization` | `pip install "lattifai[diarization]"` | Speaker diarization (NeMo, pyannote) |
|
|
189
|
+
| `event` | `pip install "lattifai[event]"` | Audio event detection |
|
|
190
|
+
|
|
191
|
+
**Note:** Base installation includes full alignment functionality. Use `[all]` for transcription and YouTube features.
|
|
192
|
+
|
|
193
|
+
### Caption Format Support
|
|
194
|
+
|
|
195
|
+
Caption/subtitle format parsing is provided by [lattifai-captions](https://github.com/lattifai/captions), a separate package supporting 30+ formats (SRT, VTT, ASS, TTML, TextGrid, NLE formats, etc.). It is a required dependency, so it is installed automatically with every `lattifai` installation, including the base `pip install lattifai`.
|
|
196
|
+
|
|
186
197
|
### API Keys
|
|
187
198
|
|
|
188
199
|
**LattifAI API Key (Required)** - Get your free key at [lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
|
|
@@ -220,7 +231,7 @@ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
|
|
|
220
231
|
### Python SDK
|
|
221
232
|
|
|
222
233
|
```python
|
|
223
|
-
from lattifai import LattifAI
|
|
234
|
+
from lattifai.client import LattifAI
|
|
224
235
|
|
|
225
236
|
client = LattifAI()
|
|
226
237
|
caption = client.alignment(
|
|
@@ -319,8 +330,8 @@ lai transcribe align audio.wav output.srt \
|
|
|
319
330
|
### Configuration Objects
|
|
320
331
|
|
|
321
332
|
```python
|
|
322
|
-
from lattifai import
|
|
323
|
-
|
|
333
|
+
from lattifai.client import LattifAI
|
|
334
|
+
from lattifai.config import (
|
|
324
335
|
ClientConfig,
|
|
325
336
|
AlignmentConfig,
|
|
326
337
|
CaptionConfig,
|
|
@@ -365,7 +376,8 @@ caption = client.youtube(
|
|
|
365
376
|
| `include_speaker_in_text` | `True` | Include speaker labels in text output |
|
|
366
377
|
|
|
367
378
|
```python
|
|
368
|
-
from lattifai import LattifAI
|
|
379
|
+
from lattifai.client import LattifAI
|
|
380
|
+
from lattifai.config import CaptionConfig
|
|
369
381
|
|
|
370
382
|
client = LattifAI(
|
|
371
383
|
caption_config=CaptionConfig(
|
|
@@ -396,6 +408,9 @@ caption = client.alignment(
|
|
|
396
408
|
### Word-Level Alignment
|
|
397
409
|
|
|
398
410
|
```python
|
|
411
|
+
from lattifai.client import LattifAI
|
|
412
|
+
from lattifai.config import CaptionConfig
|
|
413
|
+
|
|
399
414
|
client = LattifAI(caption_config=CaptionConfig(word_level=True))
|
|
400
415
|
caption = client.alignment(
|
|
401
416
|
input_media="audio.wav",
|
|
@@ -420,7 +435,8 @@ Automatically identify and label different speakers in audio.
|
|
|
420
435
|
- Gemini transcription → Names extracted from context (e.g., "Hi, I'm Alice" → `Alice`)
|
|
421
436
|
|
|
422
437
|
```python
|
|
423
|
-
from lattifai import LattifAI
|
|
438
|
+
from lattifai.client import LattifAI
|
|
439
|
+
from lattifai.config import DiarizationConfig
|
|
424
440
|
|
|
425
441
|
client = LattifAI(
|
|
426
442
|
diarization_config=DiarizationConfig(
|
|
@@ -453,6 +469,51 @@ Input Caption → Reader → Tokenizer
|
|
|
453
469
|
|
|
454
470
|
---
|
|
455
471
|
|
|
472
|
+
## Text Processing
|
|
473
|
+
|
|
474
|
+
The tokenizer handles various text patterns for forced alignment.
|
|
475
|
+
|
|
476
|
+
### Bracket/Caption Handling
|
|
477
|
+
|
|
478
|
+
Visual captions and annotations in brackets are treated specially - they get **two pronunciation paths** so the aligner can choose:
|
|
479
|
+
1. **Silence path** - skip when content doesn't appear in audio
|
|
480
|
+
2. **Inner text pronunciation** - match if someone actually says the words
|
|
481
|
+
|
|
482
|
+
| Bracket Type | Symbol | Example | Alignment Behavior |
|
|
483
|
+
|--------------|--------|---------|-------------------|
|
|
484
|
+
| Half-width square | `[]` | `[APPLAUSE]` | Skip or match "applause" |
|
|
485
|
+
| Half-width paren | `()` | `(music)` | Skip or match "music" |
|
|
486
|
+
| Full-width square | `【】` | `【笑声】` | Skip or match "笑声" |
|
|
487
|
+
| Full-width paren | `()` | `(音乐)` | Skip or match "音乐" |
|
|
488
|
+
| Angle brackets | `<>` | `<intro>` | Skip or match "intro" |
|
|
489
|
+
| Book title marks | `《》` | `《开场白》` | Skip or match "开场白" |
|
|
490
|
+
|
|
491
|
+
This allows proper handling of:
|
|
492
|
+
- **Visual descriptions**: `[Barret adjusts the camera and smiles]` → skipped if not spoken
|
|
493
|
+
- **Sound effects**: `[APPLAUSE]`, `(music)` → matched if audible
|
|
494
|
+
- **Chinese annotations**: `【笑声】`, `(鼓掌)` → flexible alignment
|
|
495
|
+
|
|
496
|
+
### Multilingual Text
|
|
497
|
+
|
|
498
|
+
| Pattern | Handling | Example |
|
|
499
|
+
|---------|----------|---------|
|
|
500
|
+
| CJK characters | Split individually | `你好` → `["你", "好"]` |
|
|
501
|
+
| Latin words | Grouped with accents | `Kühlschrank` → `["Kühlschrank"]` |
|
|
502
|
+
| Contractions | Kept together | `I'm`, `don't`, `we'll` |
|
|
503
|
+
| Punctuation | Attached to words | `Hello,` `world!` |
|
|
504
|
+
|
|
505
|
+
### Speaker Labels
|
|
506
|
+
|
|
507
|
+
Recognized speaker patterns are preserved during alignment:
|
|
508
|
+
|
|
509
|
+
| Format | Example | Output |
|
|
510
|
+
|--------|---------|--------|
|
|
511
|
+
| Arrow prefix | `>> Alice:` or `&gt;&gt; Alice:` (HTML-escaped) | `[Alice]` |
|
|
512
|
+
| LattifAI format | `[SPEAKER_01]:` | `[SPEAKER_01]` |
|
|
513
|
+
| Uppercase name | `SPEAKER NAME:` | `[SPEAKER NAME]` |
|
|
514
|
+
|
|
515
|
+
---
|
|
516
|
+
|
|
456
517
|
## Supported Formats & Languages
|
|
457
518
|
|
|
458
519
|
### Media Formats
|
|
@@ -461,7 +522,9 @@ Input Caption → Reader → Tokenizer
|
|
|
461
522
|
|------|---------|
|
|
462
523
|
| **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
|
|
463
524
|
| **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
|
|
464
|
-
| **Caption** | SRT, VTT, ASS, SSA, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
|
|
525
|
+
| **Caption** | SRT, VTT, ASS, SSA, SRV3, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
|
|
526
|
+
|
|
527
|
+
> **Note**: Caption format handling is provided by [lattifai-captions](https://github.com/lattifai/captions), which is automatically installed as a dependency. For standalone caption processing without alignment features, install `pip install lattifai-captions`.
|
|
465
528
|
|
|
466
529
|
### JSON Format
|
|
467
530
|
|
|
@@ -515,8 +578,8 @@ WEBVTT
|
|
|
515
578
|
**Writing**: Use `word_level=True` with `karaoke_config` to output YouTube VTT style:
|
|
516
579
|
|
|
517
580
|
```python
|
|
518
|
-
from lattifai import Caption
|
|
519
|
-
from lattifai.config
|
|
581
|
+
from lattifai.caption import Caption
|
|
582
|
+
from lattifai.caption.config import KaraokeConfig
|
|
520
583
|
|
|
521
584
|
caption = Caption.read("input.vtt")
|
|
522
585
|
caption.write(
|
|
@@ -584,7 +647,7 @@ cd lattifai-python
|
|
|
584
647
|
uv sync && source .venv/bin/activate
|
|
585
648
|
|
|
586
649
|
# Or pip
|
|
587
|
-
pip install -e ".[
|
|
650
|
+
pip install -e ".[all,dev]"
|
|
588
651
|
|
|
589
652
|
# Run tests
|
|
590
653
|
pytest
|