lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
  32. lattifai-1.3.0.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,8 @@ try:
15
15
  except ImportError:
16
16
  yt_dlp = None
17
17
 
18
- from ..config.caption import CAPTION_FORMATS
18
+ from lattifai.caption.config import CAPTION_FORMATS
19
+
19
20
  from ..errors import LattifAIError
20
21
  from ..workflow.base import setup_workflow_logger
21
22
  from ..workflow.file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
@@ -49,6 +50,12 @@ class YoutubeLoader:
49
50
  if yt_dlp is None:
50
51
  raise ImportError("yt-dlp is required. Install with `pip install yt-dlp`")
51
52
 
53
+ # Auto-load from environment if not specified
54
+ if proxy is None:
55
+ proxy = os.getenv("YOUTUBE_PROXY")
56
+ if cookies is None:
57
+ cookies = os.getenv("YOUTUBE_COOKIE_FILE") or os.getenv("YOUTUBE_COOKIE_BROWSER")
58
+
52
59
  self.proxy = proxy
53
60
  self.cookies = cookies
54
61
 
@@ -64,14 +71,29 @@ class YoutubeLoader:
64
71
 
65
72
  if self.proxy:
66
73
  self._base_opts["proxy"] = self.proxy
74
+ logger.info(f"🌐 Using proxy: {self.proxy}")
67
75
 
76
+ # Cookie configuration
68
77
  if self.cookies:
69
- self._base_opts["cookiefile"] = self.cookies
78
+ # Check if it's a browser name (chrome, firefox, safari, etc.)
79
+ browser_names = ["chrome", "firefox", "safari", "edge", "opera", "brave"]
80
+ if self.cookies.lower() in browser_names:
81
+ # Use cookies from browser directly
82
+ self._base_opts["cookiesfrombrowser"] = (self.cookies.lower(),)
83
+ logger.info(f"🍪 Using cookies from browser: {self.cookies}")
84
+ else:
85
+ # Use cookie file
86
+ cookie_path = Path(self.cookies).expanduser()
87
+ if cookie_path.exists():
88
+ self._base_opts["cookiefile"] = str(cookie_path)
89
+ logger.info(f"🍪 Using cookie file: {cookie_path}")
90
+ else:
91
+ logger.warning(f"⚠️ Cookie file not found: {cookie_path}")
92
+ logger.warning("💡 Tip: Run 'yt-dlp --cookies-from-browser chrome' to extract cookies")
70
93
 
71
- # Strategy: Prefer Android client to avoid PO Token issues on Web
72
- # But for captions, sometimes Web is needed.
73
- # We start with a robust default.
74
- self._base_opts["extractor_args"] = {"youtube": {"player_client": ["android", "web"]}}
94
+ # Note: player_client configuration is removed to avoid format availability issues
95
+ # with certain videos. Let yt-dlp automatically select the best client.
96
+ # Previous config caused "Requested format is not available" errors for some videos.
75
97
 
76
98
  def get_video_info(self, video_id: str) -> Dict[str, Any]:
77
99
  """
@@ -136,8 +158,23 @@ class YoutubeLoader:
136
158
 
137
159
  except yt_dlp.utils.DownloadError as e:
138
160
  msg = str(e)
139
- if "Sign in to confirm" in msg or "Private video" in msg:
140
- raise VideoUnavailableError(f"Video {video_id} is unavailable: {msg}")
161
+ if "Sign in to confirm" in msg or "not a bot" in msg:
162
+ # Bot detection error - provide helpful guidance
163
+ error_msg = (
164
+ f"🤖 YouTube Bot Detection: Video {video_id} requires authentication.\n\n"
165
+ "Solutions:\n"
166
+ "1. Use browser cookies (recommended):\n"
167
+ " loader = YoutubeLoader(cookies='chrome') # or 'firefox', 'safari'\n\n"
168
+ "2. Export cookie file:\n"
169
+ " yt-dlp --cookies-from-browser chrome --cookies cookies.txt <video_url>\n"
170
+ " loader = YoutubeLoader(cookies='cookies.txt')\n\n"
171
+ "3. Environment variable:\n"
172
+ " export YOUTUBE_COOKIE_BROWSER=chrome\n\n"
173
+ f"Original error: {msg}"
174
+ )
175
+ raise VideoUnavailableError(error_msg) from e
176
+ elif "Private video" in msg:
177
+ raise VideoUnavailableError(f"Video {video_id} is private") from e
141
178
  raise YouTubeError(f"yt-dlp failed: {msg}") from e
142
179
  except Exception as e:
143
180
  raise YouTubeError(f"Unexpected error: {str(e)}") from e
@@ -192,8 +229,8 @@ class YoutubeLoader:
192
229
  return "Unknown"
193
230
 
194
231
  def _find_best_format(self, formats: List[Dict]) -> Optional[Dict]:
195
- # Prefer json3, then vtt
196
- priority = ["json3", "vtt", "ttml", "srv3", "srv2", "srv1"]
232
+ # Prefer json3 (best precision), srv3 (word-level timing), then vtt
233
+ priority = ["json3", "srv3", "vtt", "ttml", "srv2", "srv1"]
197
234
 
198
235
  for fmt_ext in priority:
199
236
  for f in formats:
@@ -234,18 +271,11 @@ class YoutubeLoader:
234
271
  """
235
272
  url = f"https://www.youtube.com/watch?v={video_id}"
236
273
 
237
- # Use default yt-dlp config to get DASH formats with separate audio streams
274
+ # Use base opts (includes proxy and cookie config) + DASH manifest
238
275
  opts = {
239
- "quiet": True,
240
- "no_warnings": True,
241
- "skip_download": True,
242
- "extract_flat": False,
276
+ **self._base_opts,
243
277
  "youtube_include_dash_manifest": True,
244
278
  }
245
- if self.proxy:
246
- opts["proxy"] = self.proxy
247
- if self.cookies:
248
- opts["cookiefile"] = self.cookies
249
279
 
250
280
  try:
251
281
  with yt_dlp.YoutubeDL(opts) as ydl:
@@ -253,18 +283,57 @@ class YoutubeLoader:
253
283
 
254
284
  # Get all formats and filter for audio-only (no video track)
255
285
  formats = info.get("formats", [])
286
+
287
+ def is_direct_url(url: str) -> bool:
288
+ """Check if URL is a direct stream URL (not HLS manifest)"""
289
+ if not url:
290
+ return False
291
+ # HLS manifests contain these patterns
292
+ hls_patterns = ["manifest.googlevideo.com", "/hls_playlist/", ".m3u8"]
293
+ return not any(p in url for p in hls_patterns)
294
+
256
295
  audio_formats = [
257
296
  f
258
297
  for f in formats
259
298
  if f.get("acodec") not in (None, "none")
260
299
  and f.get("vcodec") in (None, "none")
261
300
  and f.get("url") # Must have a direct URL
301
+ and is_direct_url(f.get("url")) # Exclude HLS manifests
262
302
  ]
263
303
 
264
304
  if not audio_formats:
265
- raise YouTubeError(
266
- "No audio-only formats available. " "YouTube may require authentication for this video."
267
- )
305
+ # Fallback: If no audio-only formats, use lowest resolution video with audio
306
+ # This happens with HLS-only videos (e.g., protected content)
307
+ logger.warning("No audio-only formats found. Falling back to lowest resolution video with audio.")
308
+ audio_formats = [
309
+ f
310
+ for f in formats
311
+ if f.get("acodec") not in (None, "none")
312
+ and f.get("vcodec") not in (None, "none")
313
+ and f.get("url")
314
+ and is_direct_url(f.get("url")) # Exclude HLS manifests
315
+ ]
316
+ # Sort by resolution (lowest first) for minimal bandwidth
317
+ audio_formats.sort(key=lambda f: f.get("height") or f.get("width") or 9999)
318
+
319
+ if not audio_formats:
320
+ # Check if there are HLS-only formats (common for Shorts)
321
+ # HLS can still work with server-side streaming (same IP)
322
+ hls_with_audio = [f for f in formats if f.get("acodec") not in (None, "none") and f.get("url")]
323
+ if hls_with_audio:
324
+ logger.warning("Only HLS streams available. Returning HLS URL for server-side streaming.")
325
+ # Sort: prefer audio-only, then by resolution (lowest first)
326
+ hls_with_audio.sort(
327
+ key=lambda f: (
328
+ 0 if f.get("vcodec") in (None, "none") else 1,
329
+ f.get("height") or f.get("width") or 9999,
330
+ )
331
+ )
332
+ audio_formats = hls_with_audio
333
+ else:
334
+ raise YouTubeError(
335
+ "No formats with audio available. YouTube may require authentication for this video."
336
+ )
268
337
 
269
338
  # Filter by audio_track_id if specified (for multi-language audio)
270
339
  if audio_track_id:
@@ -314,17 +383,30 @@ class YoutubeLoader:
314
383
  audio_formats.sort(key=score_format, reverse=True)
315
384
  best = audio_formats[0]
316
385
 
386
+ # Check if selected format is HLS (requires server-side streaming)
387
+ best_url = best.get("url", "")
388
+ is_hls = not is_direct_url(best_url)
389
+
317
390
  return {
318
- "url": best.get("url"),
391
+ "url": best_url,
319
392
  "mime_type": best.get("ext", format_preference),
320
393
  "bitrate": best.get("abr") or best.get("tbr"),
394
+ "sample_rate": best.get("asr"), # Audio sample rate
321
395
  "content_length": best.get("filesize") or best.get("filesize_approx"),
322
396
  "format_id": best.get("format_id"),
323
397
  "ext": best.get("ext"),
398
+ "is_hls": is_hls, # True = use server streaming, False = use proxy
324
399
  }
325
400
 
326
401
  except yt_dlp.utils.DownloadError as e:
327
- raise YouTubeError(f"Failed to get audio URL: {str(e)}") from e
402
+ msg = str(e)
403
+ if "Sign in to confirm" in msg or "not a bot" in msg:
404
+ raise YouTubeError(
405
+ f"🤖 YouTube Bot Detection: Cookie configuration required to access this video. "
406
+ f"Reference: YoutubeLoader(cookies='chrome') or set environment variable YOUTUBE_COOKIE_BROWSER=chrome. "
407
+ f"Original error: {msg}"
408
+ ) from e
409
+ raise YouTubeError(f"Failed to get audio URL: {msg}") from e
328
410
  except Exception as e:
329
411
  raise YouTubeError(f"Unexpected error getting audio URL: {str(e)}") from e
330
412
 
@@ -346,18 +428,12 @@ class YoutubeLoader:
346
428
  """
347
429
  url = f"https://www.youtube.com/watch?v={video_id}"
348
430
 
349
- # Use default yt-dlp config to get all available formats
431
+ # Use base opts (includes proxy and cookie config) + DASH and HLS manifests
350
432
  opts = {
351
- "quiet": True,
352
- "no_warnings": True,
353
- "skip_download": True,
354
- "extract_flat": False,
433
+ **self._base_opts,
355
434
  "youtube_include_dash_manifest": True,
435
+ "youtube_include_hls_manifest": True,
356
436
  }
357
- if self.proxy:
358
- opts["proxy"] = self.proxy
359
- if self.cookies:
360
- opts["cookiefile"] = self.cookies
361
437
 
362
438
  try:
363
439
  with yt_dlp.YoutubeDL(opts) as ydl:
@@ -366,27 +442,34 @@ class YoutubeLoader:
366
442
  # Get all formats
367
443
  formats = info.get("formats", [])
368
444
 
445
+ def is_direct_url(url: str) -> bool:
446
+ """Check if URL is a direct stream URL (not HLS manifest)"""
447
+ if not url:
448
+ return False
449
+ hls_patterns = ["manifest.googlevideo.com", "/hls_playlist/", ".m3u8"]
450
+ return not any(p in url for p in hls_patterns)
451
+
369
452
  # Filter for video formats:
370
453
  # - Must have video codec
371
- # - Must have direct URL (not manifest/playlist)
372
- # - Exclude HLS/DASH manifests (protocol contains m3u8 or dash)
373
- def is_direct_video(f: Dict) -> bool:
454
+ # - Must have a URL
455
+ # - Prefer direct URLs (DASH) over HLS manifests
456
+ def is_usable_video(f: Dict) -> bool:
374
457
  if f.get("vcodec") in (None, "none"):
375
458
  return False
376
- url = f.get("url", "")
377
- protocol = f.get("protocol", "")
378
- # Exclude HLS manifests
379
- if "m3u8" in protocol or ".m3u8" in url or "manifest.googlevideo.com" in url:
380
- return False
381
- # Exclude DASH manifests
382
- if "dash" in protocol:
459
+ if not f.get("url"):
383
460
  return False
384
461
  return True
385
462
 
386
- video_formats = [f for f in formats if is_direct_video(f)]
463
+ # First try: direct URLs only (exclude HLS)
464
+ video_formats = [f for f in formats if is_usable_video(f) and is_direct_url(f.get("url", ""))]
387
465
 
466
+ # Fallback: include HLS if no direct formats
388
467
  if not video_formats:
389
- raise YouTubeError("No direct video formats available (only HLS/DASH manifests found)")
468
+ logger.warning("No direct video URLs found. Falling back to HLS formats.")
469
+ video_formats = [f for f in formats if is_usable_video(f)]
470
+
471
+ if not video_formats:
472
+ raise YouTubeError("No video formats available")
390
473
 
391
474
  # Parse target height from quality parameter
392
475
  target_height = None
@@ -413,15 +496,19 @@ class YoutubeLoader:
413
496
  video_formats.sort(key=score_format, reverse=True)
414
497
  best = video_formats[0]
415
498
 
499
+ # Check if selected format is HLS
500
+ best_url = best.get("url", "")
501
+ is_hls = not is_direct_url(best_url)
502
+
416
503
  # Log selection for debugging
417
504
  logger.info(
418
505
  f"Selected video format: {best.get('format_id')} "
419
506
  f"({best.get('width')}x{best.get('height')}, "
420
- f"vcodec={best.get('vcodec')}, acodec={best.get('acodec')})"
507
+ f"vcodec={best.get('vcodec')}, acodec={best.get('acodec')}, is_hls={is_hls})"
421
508
  )
422
509
 
423
510
  return {
424
- "url": best.get("url"),
511
+ "url": best_url,
425
512
  "mime_type": best.get("ext", format_preference),
426
513
  "width": best.get("width"),
427
514
  "height": best.get("height"),
@@ -432,10 +519,18 @@ class YoutubeLoader:
432
519
  "content_length": best.get("filesize") or best.get("filesize_approx"),
433
520
  "format_id": best.get("format_id"),
434
521
  "ext": best.get("ext"),
522
+ "is_hls": is_hls,
435
523
  }
436
524
 
437
525
  except yt_dlp.utils.DownloadError as e:
438
- raise YouTubeError(f"Failed to get video URL: {str(e)}") from e
526
+ msg = str(e)
527
+ if "Sign in to confirm" in msg or "not a bot" in msg:
528
+ raise YouTubeError(
529
+ f"🤖 YouTube Bot Detection: Cookie configuration required to access this video. "
530
+ f"Reference: YoutubeLoader(cookies='chrome') or set environment variable YOUTUBE_COOKIE_BROWSER=chrome. "
531
+ f"Original error: {msg}"
532
+ ) from e
533
+ raise YouTubeError(f"Failed to get video URL: {msg}") from e
439
534
  except Exception as e:
440
535
  raise YouTubeError(f"Unexpected error getting video URL: {str(e)}") from e
441
536
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lattifai
3
- Version: 1.2.2
3
+ Version: 1.3.0
4
4
  Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
5
5
  Author-email: Lattifai Technologies <tech@lattifai.com>
6
6
  Maintainer-email: Lattice <tech@lattifai.com>
@@ -50,49 +50,40 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
50
50
  Requires-Python: <3.15,>=3.10
51
51
  Description-Content-Type: text/markdown
52
52
  License-File: LICENSE
53
- Requires-Dist: lattifai[core]
54
- Requires-Dist: lattifai[alignment]
55
- Requires-Dist: lattifai[transcription]
56
- Requires-Dist: lattifai[workflow]
57
- Requires-Dist: lattifai[server]
58
- Provides-Extra: core
59
- Requires-Dist: k2py>=0.2.1; extra == "core"
60
- Requires-Dist: lattifai-core>=0.6.0; extra == "core"
61
- Requires-Dist: lattifai-run>=1.0.1; extra == "core"
62
- Requires-Dist: python-dotenv; extra == "core"
63
- Requires-Dist: msgpack; extra == "core"
64
- Requires-Dist: scipy!=1.16.3; extra == "core"
65
- Requires-Dist: av; extra == "core"
66
- Provides-Extra: alignment
67
- Requires-Dist: lhotse>=1.26.0; extra == "alignment"
68
- Requires-Dist: colorful>=0.5.6; extra == "alignment"
69
- Requires-Dist: pysubs2; extra == "alignment"
70
- Requires-Dist: praatio; extra == "alignment"
71
- Requires-Dist: tgt; extra == "alignment"
72
- Requires-Dist: onnx>=1.16.0; extra == "alignment"
73
- Requires-Dist: onnxruntime; extra == "alignment"
74
- Requires-Dist: g2p-phonemizer>=0.4.0; extra == "alignment"
75
- Requires-Dist: wtpsplit>=2.1.7; extra == "alignment"
76
- Requires-Dist: modelscope>=1.33.0; extra == "alignment"
77
- Requires-Dist: error-align-fix>=0.1.4; extra == "alignment"
53
+ Requires-Dist: python-dotenv
54
+ Requires-Dist: colorful>=0.5.6
55
+ Requires-Dist: lattifai-run>=1.0.1
56
+ Requires-Dist: lattifai-captions[splitting]>=0.1.6
57
+ Requires-Dist: lattifai-core-hq>=0.6.4
58
+ Requires-Dist: g2p-phonemizer>=0.4.0
59
+ Requires-Dist: error-align-fix>=0.1.4
60
+ Requires-Dist: lhotse>=1.26.0
61
+ Requires-Dist: k2py==0.2.4
62
+ Requires-Dist: onnxruntime
63
+ Requires-Dist: av
64
+ Requires-Dist: msgpack
65
+ Provides-Extra: event
66
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "event"
67
+ Provides-Extra: diarization
68
+ Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "diarization"
69
+ Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "diarization"
78
70
  Provides-Extra: transcription
79
71
  Requires-Dist: OmniSenseVoice>=0.4.2; extra == "transcription"
80
72
  Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4; extra == "transcription"
81
73
  Requires-Dist: google-genai>=1.22.0; extra == "transcription"
82
74
  Requires-Dist: pyannote-audio-notorchdeps>=4.0.2; extra == "transcription"
83
- Provides-Extra: workflow
84
- Requires-Dist: questionary>=2.0; extra == "workflow"
85
- Requires-Dist: yt-dlp; extra == "workflow"
86
- Requires-Dist: pycryptodome; extra == "workflow"
87
- Provides-Extra: server
88
- Requires-Dist: fastapi>=0.111.0; extra == "server"
89
- Requires-Dist: uvicorn>=0.30.0; extra == "server"
90
- Requires-Dist: python-multipart>=0.0.9; extra == "server"
91
- Requires-Dist: jinja2>=3.1.4; extra == "server"
75
+ Provides-Extra: youtube
76
+ Requires-Dist: questionary>=2.0; extra == "youtube"
77
+ Requires-Dist: yt-dlp; extra == "youtube"
78
+ Requires-Dist: pycryptodome; extra == "youtube"
92
79
  Provides-Extra: dev
80
+ Requires-Dist: black; extra == "dev"
93
81
  Requires-Dist: pytest>=8.0.0; extra == "dev"
94
82
  Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
95
83
  Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
84
+ Provides-Extra: all
85
+ Requires-Dist: lattifai[transcription]; extra == "all"
86
+ Requires-Dist: lattifai[youtube]; extra == "all"
96
87
  Dynamic: license-file
97
88
 
98
89
  <div align="center">
@@ -104,13 +95,13 @@ Dynamic: license-file
104
95
  </div>
105
96
 
106
97
  <p align="center">
107
- 🌐 <a href="https://lattifai.com"><b>Official Website</b></a> &nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> &nbsp&nbsp | &nbsp&nbsp 🤗 <a href="https://huggingface.co/Lattifai/Lattice-1">Model</a> &nbsp&nbsp | &nbsp&nbsp 📑 <a href="https://lattifai.com/blogs">Blog</a> &nbsp&nbsp | &nbsp&nbsp <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
98
+ 🌐 <a href="https://lattifai.com"><b>Official Website</b></a> &nbsp;&nbsp; | &nbsp;&nbsp; 🖥️ <a href="https://github.com/lattifai/lattifai-python">GitHub</a> &nbsp;&nbsp; | &nbsp;&nbsp; 🤗 <a href="https://huggingface.co/LattifAI/Lattice-1">Model</a> &nbsp;&nbsp; | &nbsp;&nbsp; 📑 <a href="https://lattifai.com/blogs">Blog</a> &nbsp;&nbsp; | &nbsp;&nbsp; <a href="https://discord.gg/kvF4WsBRK8"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" style="vertical-align: middle;"></a>
108
99
  </p>
109
100
 
110
101
 
111
102
  # LattifAI: Precision Alignment, Infinite Possibilities
112
103
 
113
- Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/Lattifai/Lattice-1) model.
104
+ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/LattifAI/Lattice-1) model.
114
105
 
115
106
  ## Table of Contents
116
107
 
@@ -120,6 +111,7 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
120
111
  - [CLI Reference](#cli-reference)
121
112
  - [Python SDK](#python-sdk)
122
113
  - [Advanced Features](#advanced-features)
114
+ - [Text Processing](#text-processing)
123
115
  - [Supported Formats & Languages](#supported-formats--languages)
124
116
  - [Roadmap](#roadmap)
125
117
  - [Development](#development)
@@ -130,7 +122,7 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
130
122
 
131
123
  | Feature | Description |
132
124
  |---------|-------------|
133
- | **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/Lattifai/Lattice-1) |
125
+ | **Forced Alignment** | Word-level and segment-level audio-text synchronization powered by [Lattice-1](https://huggingface.co/LattifAI/Lattice-1) |
134
126
  | **Multi-Model Transcription** | Gemini (100+ languages), Parakeet (24 languages), SenseVoice (5 languages) |
135
127
  | **Speaker Diarization** | Multi-speaker identification with label preservation |
136
128
  | **Streaming Mode** | Process audio up to 20 hours with minimal memory |
@@ -138,10 +130,10 @@ Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](h
138
130
 
139
131
  ### Alignment Models
140
132
 
141
- | Model | Languages | Description |
142
- |-------|-----------|-------------|
143
- | **Lattice-1** | English, Chinese, German | Production model with mixed-language alignment support |
144
- | **Lattice-1-Alpha** | English | Initial release with English forced alignment |
133
+ | Model | Links | Languages | Description |
134
+ |-------|-------|-----------|-------------|
135
+ | **Lattice-1** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1) | English, Chinese, German | Production model with mixed-language alignment support |
136
+ | **Lattice-1-Alpha** | [🤗 HF](https://huggingface.co/LattifAI/Lattice-1-Alpha) • [🤖 MS](https://modelscope.cn/models/LattifAI/Lattice-1-Alpha) | English | Initial release with English forced alignment |
145
137
 
146
138
  **Model Hub**: Models can be downloaded from `huggingface` (default) or `modelscope` (recommended for users in China):
147
139
 
@@ -151,7 +143,8 @@ lai alignment align audio.wav caption.srt output.srt alignment.model_hub=modelsc
151
143
  ```
152
144
 
153
145
  ```python
154
- from lattifai import LattifAI, AlignmentConfig
146
+ from lattifai.client import LattifAI
147
+ from lattifai.config import AlignmentConfig
155
148
 
156
149
  client = LattifAI(alignment_config=AlignmentConfig(model_hub="modelscope"))
157
150
  ```
@@ -173,16 +166,34 @@ uvx --from lattifai lai --help
173
166
 
174
167
  # Or create a project
175
168
  mkdir my-project && cd my-project
176
- uv init --bare && uv add lattifai
169
+ uv init --bare && uv add "lattifai[all]"
177
170
  uv run lai alignment align audio.wav caption.srt output.srt
178
171
  ```
179
172
 
180
173
  ### Using pip
181
174
 
182
175
  ```bash
183
- pip install lattifai
176
+ # Full installation (recommended)
177
+ pip install "lattifai[all]"
184
178
  ```
185
179
 
180
+ ### Installation Options
181
+
182
+ | Extra | Command | Includes |
183
+ |-------|---------|----------|
184
+ | (base) | `pip install lattifai` | Forced alignment (Lattice-1, k2py, ONNX, captions) |
185
+ | `all` | `pip install "lattifai[all]"` | Base + transcription + youtube |
186
+ | `transcription` | `pip install "lattifai[transcription]"` | ASR models (Gemini, Parakeet, SenseVoice) |
187
+ | `youtube` | `pip install "lattifai[youtube]"` | YouTube download (yt-dlp) |
188
+ | `diarization` | `pip install "lattifai[diarization]"` | Speaker diarization (NeMo, pyannote) |
189
+ | `event` | `pip install "lattifai[event]"` | Audio event detection |
190
+
191
+ **Note:** Base installation includes full alignment functionality. Use `[all]` for transcription and YouTube features.
192
+
193
+ ### Caption Format Support
194
+
195
+ Caption/subtitle format parsing is provided by [lattifai-captions](https://github.com/lattifai/captions), a separate package supporting 30+ formats (SRT, VTT, ASS, TTML, TextGrid, NLE formats, etc.). It is a required dependency of the base `lattifai` package, so it is installed automatically with `pip install lattifai` (and therefore with `lattifai[all]` as well); no extra is needed.
196
+
186
197
  ### API Keys
187
198
 
188
199
  **LattifAI API Key (Required)** - Get your free key at [lattifai.com/dashboard/api-keys](https://lattifai.com/dashboard/api-keys)
@@ -220,7 +231,7 @@ lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID"
220
231
  ### Python SDK
221
232
 
222
233
  ```python
223
- from lattifai import LattifAI
234
+ from lattifai.client import LattifAI
224
235
 
225
236
  client = LattifAI()
226
237
  caption = client.alignment(
@@ -319,8 +330,8 @@ lai transcribe align audio.wav output.srt \
319
330
  ### Configuration Objects
320
331
 
321
332
  ```python
322
- from lattifai import (
323
- LattifAI,
333
+ from lattifai.client import LattifAI
334
+ from lattifai.config import (
324
335
  ClientConfig,
325
336
  AlignmentConfig,
326
337
  CaptionConfig,
@@ -365,7 +376,8 @@ caption = client.youtube(
365
376
  | `include_speaker_in_text` | `True` | Include speaker labels in text output |
366
377
 
367
378
  ```python
368
- from lattifai import LattifAI, CaptionConfig
379
+ from lattifai.client import LattifAI
380
+ from lattifai.config import CaptionConfig
369
381
 
370
382
  client = LattifAI(
371
383
  caption_config=CaptionConfig(
@@ -396,6 +408,9 @@ caption = client.alignment(
396
408
  ### Word-Level Alignment
397
409
 
398
410
  ```python
411
+ from lattifai.client import LattifAI
412
+ from lattifai.config import CaptionConfig
413
+
399
414
  client = LattifAI(caption_config=CaptionConfig(word_level=True))
400
415
  caption = client.alignment(
401
416
  input_media="audio.wav",
@@ -420,7 +435,8 @@ Automatically identify and label different speakers in audio.
420
435
  - Gemini transcription → Names extracted from context (e.g., "Hi, I'm Alice" → `Alice`)
421
436
 
422
437
  ```python
423
- from lattifai import LattifAI, DiarizationConfig
438
+ from lattifai.client import LattifAI
439
+ from lattifai.config import DiarizationConfig
424
440
 
425
441
  client = LattifAI(
426
442
  diarization_config=DiarizationConfig(
@@ -453,6 +469,51 @@ Input Caption → Reader → Tokenizer
453
469
 
454
470
  ---
455
471
 
472
+ ## Text Processing
473
+
474
+ The tokenizer handles various text patterns for forced alignment.
475
+
476
+ ### Bracket/Caption Handling
477
+
478
+ Visual captions and annotations in brackets are treated specially - they get **two pronunciation paths** so the aligner can choose:
479
+ 1. **Silence path** - skip when content doesn't appear in audio
480
+ 2. **Inner text pronunciation** - match if someone actually says the words
481
+
482
+ | Bracket Type | Symbol | Example | Alignment Behavior |
483
+ |--------------|--------|---------|-------------------|
484
+ | Half-width square | `[]` | `[APPLAUSE]` | Skip or match "applause" |
485
+ | Half-width paren | `()` | `(music)` | Skip or match "music" |
486
+ | Full-width square | `【】` | `【笑声】` | Skip or match "笑声" |
487
+ | Full-width paren | `()` | `(音乐)` | Skip or match "音乐" |
488
+ | Angle brackets | `<>` | `<intro>` | Skip or match "intro" |
489
+ | Book title marks | `《》` | `《开场白》` | Skip or match "开场白" |
490
+
491
+ This allows proper handling of:
492
+ - **Visual descriptions**: `[Barret adjusts the camera and smiles]` → skipped if not spoken
493
+ - **Sound effects**: `[APPLAUSE]`, `(music)` → matched if audible
494
+ - **Chinese annotations**: `【笑声】`, `(鼓掌)` → flexible alignment
495
+
496
+ ### Multilingual Text
497
+
498
+ | Pattern | Handling | Example |
499
+ |---------|----------|---------|
500
+ | CJK characters | Split individually | `你好` → `["你", "好"]` |
501
+ | Latin words | Grouped with accents | `Kühlschrank` → `["Kühlschrank"]` |
502
+ | Contractions | Kept together | `I'm`, `don't`, `we'll` |
503
+ | Punctuation | Attached to words | `Hello,` `world!` |
504
+
505
+ ### Speaker Labels
506
+
507
+ Recognized speaker patterns are preserved during alignment:
508
+
509
+ | Format | Example | Output |
510
+ |--------|---------|--------|
511
+ | Arrow prefix | `>> Alice:` or `&gt;&gt; Alice:` | `[Alice]` |
512
+ | LattifAI format | `[SPEAKER_01]:` | `[SPEAKER_01]` |
513
+ | Uppercase name | `SPEAKER NAME:` | `[SPEAKER NAME]` |
514
+
515
+ ---
516
+
456
517
  ## Supported Formats & Languages
457
518
 
458
519
  ### Media Formats
@@ -461,7 +522,9 @@ Input Caption → Reader → Tokenizer
461
522
  |------|---------|
462
523
  | **Audio** | WAV, MP3, M4A, AAC, FLAC, OGG, OPUS, AIFF, and more |
463
524
  | **Video** | MP4, MKV, MOV, WEBM, AVI, and more |
464
- | **Caption** | SRT, VTT, ASS, SSA, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
525
+ | **Caption** | SRT, VTT, ASS, SSA, SRV3, JSON, TextGrid, TSV, CSV, LRC, TTML, and more |
526
+
527
+ > **Note**: Caption format handling is provided by [lattifai-captions](https://github.com/lattifai/captions), which is automatically installed as a dependency. For standalone caption processing without the alignment features, install it on its own with `pip install lattifai-captions`.
465
528
 
466
529
  ### JSON Format
467
530
 
@@ -515,8 +578,8 @@ WEBVTT
515
578
  **Writing**: Use `word_level=True` with `karaoke_config` to output YouTube VTT style:
516
579
 
517
580
  ```python
518
- from lattifai import Caption
519
- from lattifai.config.caption import KaraokeConfig
581
+ from lattifai.caption import Caption
582
+ from lattifai.caption.config import KaraokeConfig
520
583
 
521
584
  caption = Caption.read("input.vtt")
522
585
  caption.write(
@@ -584,7 +647,7 @@ cd lattifai-python
584
647
  uv sync && source .venv/bin/activate
585
648
 
586
649
  # Or pip
587
- pip install -e ".[test]"
650
+ pip install -e ".[all,dev]"
588
651
 
589
652
  # Run tests
590
653
  pytest