lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. lattifai/__init__.py +42 -27
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
  5. lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/supervision.py +1 -0
  12. lattifai/{io → caption}/text_parser.py +53 -10
  13. lattifai/cli/__init__.py +17 -0
  14. lattifai/cli/alignment.py +153 -0
  15. lattifai/cli/caption.py +204 -0
  16. lattifai/cli/server.py +19 -0
  17. lattifai/cli/transcribe.py +197 -0
  18. lattifai/cli/youtube.py +128 -0
  19. lattifai/client.py +455 -246
  20. lattifai/config/__init__.py +20 -0
  21. lattifai/config/alignment.py +73 -0
  22. lattifai/config/caption.py +178 -0
  23. lattifai/config/client.py +46 -0
  24. lattifai/config/diarization.py +67 -0
  25. lattifai/config/media.py +335 -0
  26. lattifai/config/transcription.py +84 -0
  27. lattifai/diarization/__init__.py +5 -0
  28. lattifai/diarization/lattifai.py +89 -0
  29. lattifai/errors.py +41 -34
  30. lattifai/logging.py +116 -0
  31. lattifai/mixin.py +552 -0
  32. lattifai/server/app.py +420 -0
  33. lattifai/transcription/__init__.py +76 -0
  34. lattifai/transcription/base.py +108 -0
  35. lattifai/transcription/gemini.py +219 -0
  36. lattifai/transcription/lattifai.py +103 -0
  37. lattifai/types.py +30 -0
  38. lattifai/utils.py +3 -31
  39. lattifai/workflow/__init__.py +22 -0
  40. lattifai/workflow/agents.py +6 -0
  41. lattifai/{workflows → workflow}/file_manager.py +81 -57
  42. lattifai/workflow/youtube.py +564 -0
  43. lattifai-1.0.0.dist-info/METADATA +736 -0
  44. lattifai-1.0.0.dist-info/RECORD +52 -0
  45. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  46. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  47. lattifai/base_client.py +0 -126
  48. lattifai/bin/__init__.py +0 -3
  49. lattifai/bin/agent.py +0 -324
  50. lattifai/bin/align.py +0 -295
  51. lattifai/bin/cli_base.py +0 -25
  52. lattifai/bin/subtitle.py +0 -210
  53. lattifai/io/__init__.py +0 -43
  54. lattifai/io/reader.py +0 -86
  55. lattifai/io/utils.py +0 -15
  56. lattifai/io/writer.py +0 -102
  57. lattifai/tokenizer/__init__.py +0 -3
  58. lattifai/workers/__init__.py +0 -3
  59. lattifai/workflows/__init__.py +0 -34
  60. lattifai/workflows/agents.py +0 -12
  61. lattifai/workflows/gemini.py +0 -167
  62. lattifai/workflows/prompts/README.md +0 -22
  63. lattifai/workflows/prompts/gemini/README.md +0 -24
  64. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  65. lattifai/workflows/youtube.py +0 -931
  66. lattifai-0.4.6.dist-info/METADATA +0 -806
  67. lattifai-0.4.6.dist-info/RECORD +0 -39
  68. lattifai-0.4.6.dist-info/entry_points.txt +0 -3
  69. /lattifai/{io → caption}/gemini_reader.py +0 -0
  70. /lattifai/{io → caption}/gemini_writer.py +0 -0
  71. /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
  72. /lattifai/{workflows → workflow}/base.py +0 -0
  73. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
  74. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,197 @@
1
+ """Transcription CLI entry point with nemo_run."""
2
+
3
+ from typing import Optional
4
+
5
+ import nemo_run as run
6
+ from lhotse.utils import Pathlike
7
+ from typing_extensions import Annotated
8
+
9
+ from lattifai.audio2 import AudioLoader, ChannelSelectorType
10
+ from lattifai.cli.alignment import align as alignment_align
11
+ from lattifai.config import (
12
+ AlignmentConfig,
13
+ CaptionConfig,
14
+ ClientConfig,
15
+ DiarizationConfig,
16
+ MediaConfig,
17
+ TranscriptionConfig,
18
+ )
19
+ from lattifai.utils import _resolve_model_path
20
+
21
+
22
@run.cli.entrypoint(name="run", namespace="transcribe")
def transcribe(
    input: Optional[str] = None,
    output_caption: Optional[str] = None,
    output_dir: Optional[Pathlike] = None,
    media_format: str = "mp3",
    channel_selector: Optional[ChannelSelectorType] = "average",
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
):
    """
    Run speech recognition on a local media file or a URL and write a caption file.

    The input is treated as a URL when it starts with ``http://`` or ``https://``.
    If the selected transcriber reports ``supports_url``, the URL is handed to it
    directly; otherwise the media is first downloaded with ``YouTubeDownloader``
    and then loaded through ``AudioLoader`` like any local file.

    Shortcut: invoking ``lai-transcribe`` is equivalent to running ``lai transcribe run``.

    Args:
        input: Path to input audio/video file or YouTube URL (can be provided as positional argument).
            Required even though the default is ``None``.
        output_caption: Path for output caption file (can be provided as positional argument).
            Missing parent directories are created. When omitted, URL inputs are written to
            ``youtube_LattifAI_<transcriber name>.<transcriber suffix>`` inside the output
            directory, and local inputs are written next to the input file with its suffix
            replaced by ``.LattifAI.srt``.
        output_dir: Directory for output files when using a URL (created if missing,
            defaults to the current working directory). Not used for local files.
        media_format: Media format for YouTube downloads (default: mp3).
        channel_selector: Audio channel selection strategy (default: average).
            Options: average, left, right, or an integer channel index.
            Note: Ignored when input is a URL and the transcriber consumes the URL directly.
        transcription: Transcription service configuration.
            Fields: model_name, device, language, gemini_api_key.
            A default ``TranscriptionConfig`` is used when omitted; if its
            ``lattice_model_path`` is empty it is resolved from ``Lattifai/Lattice-1``.

    Returns:
        The transcript object returned by the transcriber's ``transcribe`` coroutine.

    Raises:
        ValueError: If ``input`` is missing or empty.

    Examples:
        # Transcribe local file with positional arguments
        lai transcribe run audio.wav output.srt

        # Transcribe YouTube video
        lai transcribe run "https://www.youtube.com/watch?v=VIDEO_ID" ./output

        # Using specific transcription model
        lai transcribe run audio.mp4 output.ass \\
            transcription.model_name=nvidia/parakeet-tdt-0.6b-v3

        # Using Gemini transcription (requires API key)
        lai transcribe run audio.wav output.srt \\
            transcription.model_name=gemini-2.5-pro \\
            transcription.gemini_api_key=YOUR_KEY

        # Specify language for transcription
        lai transcribe run audio.wav output.srt \\
            transcription.language=zh

        # Full configuration with keyword arguments
        lai transcribe run \\
            input=audio.wav \\
            output_caption=output.srt \\
            transcription.device=cuda \\
            transcription.model_name=iic/SenseVoiceSmall
    """
    import asyncio
    from pathlib import Path

    import colorful

    from lattifai.transcription import create_transcriber

    # Fall back to a default configuration when none was supplied on the command line.
    cfg = transcription if transcription else TranscriptionConfig()

    if not input:
        raise ValueError("Input is required. Provide input as positional argument (file path or URL).")

    from_url = input.startswith(("http://", "https://"))

    # Working directory for downloads and auto-named URL outputs.
    if not from_url:
        work_dir = Path(str(input)).parent
    elif output_dir:
        work_dir = Path(str(output_dir)).expanduser()
        work_dir.mkdir(parents=True, exist_ok=True)
    else:
        work_dir = Path.cwd()

    # The transcriber needs a lattice model path; resolve the default one if unset.
    if not cfg.lattice_model_path:
        cfg.lattice_model_path = _resolve_model_path("Lattifai/Lattice-1")
    transcriber = create_transcriber(transcription_config=cfg)

    print(colorful.cyan(f"🎤 Starting transcription with {transcriber.name}..."))
    print(colorful.cyan(f" Input: {input}"))

    if from_url and transcriber.supports_url:
        # The transcriber can consume the URL itself; no download or audio loading.
        print(colorful.cyan(" Transcribing from URL directly..."))
        transcript = asyncio.run(transcriber.transcribe(input))
    else:
        if from_url:
            # Fetch the media locally before it can be decoded.
            print(colorful.cyan(" Downloading media from URL..."))
            from lattifai.workflow.youtube import YouTubeDownloader

            fetcher = YouTubeDownloader()
            media_file = asyncio.run(
                fetcher.download_media(
                    url=input,
                    output_dir=str(work_dir),
                    media_format=media_format,
                    force_overwrite=False,
                )
            )
            print(colorful.cyan(f" Media downloaded to: {media_file}"))
        else:
            media_file = Path(str(input))

        print(colorful.cyan(" Loading audio..."))
        loader = AudioLoader(device=cfg.device)
        audio = loader(media_file, channel_selector=channel_selector)
        transcript = asyncio.run(transcriber.transcribe(audio))

    # Pick the destination: explicit path, auto-named URL output, or sibling of the input file.
    if output_caption:
        final_output = Path(str(output_caption))
        final_output.parent.mkdir(parents=True, exist_ok=True)
    elif from_url:
        extension = transcriber.file_suffix.lstrip(".")
        final_output = work_dir / f"youtube_LattifAI_{transcriber.name}.{extension}"
    else:
        final_output = Path(str(input)).with_suffix(".LattifAI.srt")

    print(colorful.cyan(f" Output: {final_output}"))

    transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)

    print(colorful.green(f"🎉 Transcription completed: {final_output}"))

    return transcript
166
+
167
+
168
@run.cli.entrypoint(name="align", namespace="transcribe")
def transcribe_align(
    input_media: Optional[str] = None,
    output_caption: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Expose the alignment ``align`` command under the ``transcribe`` namespace.

    Every argument is forwarded unchanged, by keyword, to
    ``lattifai.cli.alignment.align``; this function adds no logic of its own.

    Args:
        input_media: Forwarded as ``input_media``.
        output_caption: Forwarded as ``output_caption``.
        media: Forwarded as ``media``.
        caption: Forwarded as ``caption``.
        client: Forwarded as ``client``.
        alignment: Forwarded as ``alignment``.
        transcription: Forwarded as ``transcription``.
        diarization: Forwarded as ``diarization``.

    Returns:
        Whatever ``lattifai.cli.alignment.align`` returns.
    """
    forwarded = {
        "input_media": input_media,
        "output_caption": output_caption,
        "media": media,
        "caption": caption,
        "client": client,
        "alignment": alignment,
        "transcription": transcription,
        "diarization": diarization,
    }
    return alignment_align(**forwarded)
189
+
190
+
191
def main():
    """Console-script entry point for ``lai-transcribe``: hand ``transcribe`` to the nemo_run CLI."""
    entrypoint = transcribe
    run.cli.main(entrypoint)


if __name__ == "__main__":
    main()
@@ -0,0 +1,128 @@
1
+ """YouTube workflow CLI entry point with nemo_run."""
2
+
3
+ from typing import Optional
4
+
5
+ import nemo_run as run
6
+ from typing_extensions import Annotated
7
+
8
+ from lattifai.client import LattifAI
9
+ from lattifai.config import (
10
+ AlignmentConfig,
11
+ CaptionConfig,
12
+ ClientConfig,
13
+ DiarizationConfig,
14
+ MediaConfig,
15
+ TranscriptionConfig,
16
+ )
17
+
18
+
19
@run.cli.entrypoint(name="youtube", namespace="alignment")
def youtube(
    yt_url: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Align captions for a YouTube video via the ``LattifAI`` client.

    The URL may be given either as the positional ``yt_url`` or as
    ``media.input_path`` (exactly one of the two). A ``LattifAI`` client is built
    from the supplied configs and its ``youtube`` method is called with the URL,
    the media output settings and the caption output settings.

    Per the original command description, the workflow for a YouTube URL is:
        1. Downloads media in the specified format (audio or video)
        2. Optionally transcribes audio with Gemini OR downloads YouTube captions
        3. Performs forced alignment with the provided or generated captions

    Shortcut: invoking ``lai-youtube`` is equivalent to running ``lai alignment youtube``.

    Args:
        yt_url: YouTube video URL (can be provided as positional argument).
        media: Media configuration for controlling formats and output directories.
            Fields read here: input_path (YouTube URL), output_dir, output_format,
            force_overwrite, channel_selector. Defaults to ``MediaConfig()``.
        client: API client configuration.
            Fields: api_key, timeout, max_retries
        alignment: Alignment configuration (model selection and inference settings).
            Fields: model_name, device, batch_size
        caption: Caption configuration for reading/writing caption files.
            Fields: output_format, output_path, normalize_text,
            split_sentence, word_level, encoding. Defaults to ``CaptionConfig()``.
        transcription: Transcription service configuration (enables Gemini transcription).
            Fields: gemini_api_key, model_name, language, device
        diarization: Speaker diarization configuration.
            Fields: enabled, num_speakers, min_speakers, max_speakers, device

    Returns:
        The result of ``LattifAI.youtube``.

    Raises:
        ValueError: If both ``yt_url`` and ``media.input_path`` are set, or if neither is.

    Examples:
        # Download from YouTube and align (positional argument)
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID"

        # With custom output directory and format
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            media.output_dir=/tmp/youtube \\
            media.output_format=mp3

        # Full configuration with smart splitting and word-level alignment
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            caption.output_path=aligned.srt \\
            caption.split_sentence=true \\
            caption.word_level=true \\
            alignment.device=cuda

        # Use Gemini transcription (requires API key)
        lai alignment youtube "https://www.youtube.com/watch?v=VIDEO_ID" \\
            transcription.gemini_api_key=YOUR_KEY \\
            transcription.model_name=gemini-2.0-flash

        # Using keyword argument (traditional syntax)
        lai alignment youtube \\
            yt_url="https://www.youtube.com/watch?v=VIDEO_ID" \\
            alignment.device=mps
    """
    media_config = media or MediaConfig()
    caption_config = caption or CaptionConfig()

    # Exactly one URL source is accepted: the positional argument or the media config.
    if yt_url:
        if media_config.input_path:
            raise ValueError(
                "Cannot specify both positional yt_url and media.input_path. "
                "Use either positional argument or config, not both."
            )
        # Store the positional URL on the config so it is read from one place below.
        media_config.set_input_path(yt_url)
    elif not media_config.input_path:
        raise ValueError("YouTube URL is required. Provide either positional yt_url or media.input_path parameter.")

    lattifai_client = LattifAI(
        client_config=client,
        alignment_config=alignment,
        caption_config=caption_config,
        transcription_config=transcription,
        diarization_config=diarization,
    )

    # Only normalize the format when one was actually configured; otherwise pass None.
    if media_config.output_format:
        requested_format = media_config.normalize_format()
    else:
        requested_format = None

    return lattifai_client.youtube(
        url=media_config.input_path,
        output_dir=media_config.output_dir,
        output_caption_path=caption_config.output_path,
        media_format=requested_format,
        force_overwrite=media_config.force_overwrite,
        split_sentence=caption_config.split_sentence,
        channel_selector=media_config.channel_selector,
    )
121
+
122
+
123
def main():
    """Script entry point: hand the ``youtube`` entrypoint to the nemo_run CLI."""
    entrypoint = youtube
    run.cli.main(entrypoint)


if __name__ == "__main__":
    main()