lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +42 -27
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
- lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/supervision.py +1 -0
- lattifai/{io → caption}/text_parser.py +53 -10
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +455 -246
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +41 -34
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/types.py +30 -0
- lattifai/utils.py +3 -31
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/file_manager.py +81 -57
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -324
- lattifai/bin/align.py +0 -295
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -43
- lattifai/io/reader.py +0 -86
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -102
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -12
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.6.dist-info/METADATA +0 -806
- lattifai-0.4.6.dist-info/RECORD +0 -39
- lattifai-0.4.6.dist-info/entry_points.txt +0 -3
- /lattifai/{io → caption}/gemini_reader.py +0 -0
- /lattifai/{io → caption}/gemini_writer.py +0 -0
- /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
- /lattifai/{workflows → workflow}/base.py +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/config/media.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""Media I/O configuration for LattifAI."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field, replace
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
from lhotse.utils import Pathlike
|
|
9
|
+
|
|
10
|
+
# File extensions (without the leading dot) accepted as audio-only media.
AUDIO_FORMATS = ("aac", "aiff", "alac", "flac", "m4a", "mp3", "ogg", "opus", "wav", "wma")

# File extensions (without the leading dot) accepted as video media.
VIDEO_FORMATS = ("3gp", "avi", "flv", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "webm", "wmv")

# Every supported extension: the de-duplicated union of both families, sorted
# alphabetically so error messages list formats in a stable order.
MEDIA_FORMATS = tuple(sorted({*AUDIO_FORMATS, *VIDEO_FORMATS}))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
|
|
42
|
+
class MediaConfig:
|
|
43
|
+
"""Unified configuration for audio/video input and output handling."""
|
|
44
|
+
|
|
45
|
+
# Input configuration (local filesystem path or URL)
|
|
46
|
+
input_path: Optional[str] = None
|
|
47
|
+
"""Local file path or URL to audio/video content."""
|
|
48
|
+
|
|
49
|
+
media_format: str = "auto"
|
|
50
|
+
"""Media format (mp3, wav, mp4, etc.) or 'auto' for automatic detection."""
|
|
51
|
+
|
|
52
|
+
sample_rate: Optional[int] = None
|
|
53
|
+
"""Audio sample rate in Hz (e.g., 16000, 44100)."""
|
|
54
|
+
|
|
55
|
+
channels: Optional[int] = None
|
|
56
|
+
"""Number of audio channels (1=mono, 2=stereo)."""
|
|
57
|
+
|
|
58
|
+
channel_selector: Optional[str | int] = "average"
|
|
59
|
+
"""Audio channel selection strategy: 'average', 'left', 'right', or channel index."""
|
|
60
|
+
|
|
61
|
+
# Output / download configuration
|
|
62
|
+
output_dir: Path = field(default_factory=lambda: Path.cwd())
|
|
63
|
+
"""Directory for output files (default: current working directory)."""
|
|
64
|
+
|
|
65
|
+
output_path: Optional[str] = None
|
|
66
|
+
"""Full path for output file (overrides output_dir + filename)."""
|
|
67
|
+
|
|
68
|
+
output_format: Optional[str] = None
|
|
69
|
+
"""Output media format (mp3, wav, mp4, etc.)."""
|
|
70
|
+
|
|
71
|
+
prefer_audio: bool = True
|
|
72
|
+
"""Prefer audio format when 'auto' is specified."""
|
|
73
|
+
|
|
74
|
+
default_audio_format: str = "mp3"
|
|
75
|
+
"""Default audio format when no format is specified."""
|
|
76
|
+
|
|
77
|
+
default_video_format: str = "mp4"
|
|
78
|
+
"""Default video format when no format is specified."""
|
|
79
|
+
|
|
80
|
+
force_overwrite: bool = False
|
|
81
|
+
"""Overwrite existing output files without prompting."""
|
|
82
|
+
|
|
83
|
+
def __post_init__(self) -> None:
|
|
84
|
+
"""Validate configuration and normalize paths/formats."""
|
|
85
|
+
self._setup_output_directory()
|
|
86
|
+
self._validate_default_formats()
|
|
87
|
+
self._normalize_media_format()
|
|
88
|
+
self._process_input_path()
|
|
89
|
+
self._process_output_path()
|
|
90
|
+
|
|
91
|
+
def _setup_output_directory(self) -> None:
|
|
92
|
+
"""Ensure output directory exists and is valid."""
|
|
93
|
+
resolved_output_dir = self._ensure_dir(self.output_dir)
|
|
94
|
+
self.output_dir = resolved_output_dir
|
|
95
|
+
|
|
96
|
+
def _validate_default_formats(self) -> None:
|
|
97
|
+
"""Validate default audio and video formats."""
|
|
98
|
+
self.default_audio_format = self._normalize_format(self.default_audio_format)
|
|
99
|
+
self.default_video_format = self._normalize_format(self.default_video_format)
|
|
100
|
+
|
|
101
|
+
def _normalize_media_format(self) -> None:
|
|
102
|
+
"""Normalize media format, allowing 'auto' during initialization."""
|
|
103
|
+
self.media_format = self._normalize_format(self.media_format, allow_auto=True)
|
|
104
|
+
|
|
105
|
+
def _process_input_path(self) -> None:
|
|
106
|
+
"""Process and validate input path if provided."""
|
|
107
|
+
if self.input_path is None:
|
|
108
|
+
return
|
|
109
|
+
|
|
110
|
+
if self._is_url(self.input_path):
|
|
111
|
+
normalized_url = self._normalize_url(self.input_path)
|
|
112
|
+
self.input_path = normalized_url
|
|
113
|
+
if self.media_format == "auto":
|
|
114
|
+
inferred_format = self._infer_format_from_source(normalized_url)
|
|
115
|
+
if inferred_format:
|
|
116
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
117
|
+
else:
|
|
118
|
+
# For local paths, normalize to string without validation here
|
|
119
|
+
# Validation will be done in check_input_sanity()
|
|
120
|
+
self.input_path = str(Path(self.input_path).expanduser())
|
|
121
|
+
if self.media_format == "auto":
|
|
122
|
+
inferred_format = Path(self.input_path).suffix.lstrip(".").lower()
|
|
123
|
+
if inferred_format:
|
|
124
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
125
|
+
|
|
126
|
+
# Validate input after setting
|
|
127
|
+
self.check_input_sanity()
|
|
128
|
+
|
|
129
|
+
def _process_output_path(self) -> None:
|
|
130
|
+
"""Process output path and format."""
|
|
131
|
+
if self.output_path is not None:
|
|
132
|
+
self.set_output_path(self.output_path)
|
|
133
|
+
elif self.output_format is not None:
|
|
134
|
+
self.output_format = self._normalize_format(self.output_format)
|
|
135
|
+
else:
|
|
136
|
+
self.output_format = None
|
|
137
|
+
|
|
138
|
+
# ------------------------------------------------------------------
|
|
139
|
+
# Public helpers
|
|
140
|
+
# ------------------------------------------------------------------
|
|
141
|
+
def clone(self, **updates: object) -> "MediaConfig":
|
|
142
|
+
"""Return a shallow copy of the config with optional overrides."""
|
|
143
|
+
return replace(self, **updates)
|
|
144
|
+
|
|
145
|
+
def normalize_format(self, media_format: Optional[str] = None, *, prefer_audio: Optional[bool] = None) -> str:
|
|
146
|
+
"""Resolve a media format (handling the special "auto" value)."""
|
|
147
|
+
prefer_audio = self.prefer_audio if prefer_audio is None else prefer_audio
|
|
148
|
+
candidate = (media_format or self.media_format or "auto").lower()
|
|
149
|
+
if candidate == "auto":
|
|
150
|
+
candidate = self.default_audio_format if prefer_audio else self.default_video_format
|
|
151
|
+
return self._normalize_format(candidate)
|
|
152
|
+
|
|
153
|
+
def is_audio_format(self, media_format: Optional[str] = None) -> bool:
|
|
154
|
+
"""Check whether the provided (or effective) format is an audio format."""
|
|
155
|
+
return self.normalize_format(media_format) in AUDIO_FORMATS
|
|
156
|
+
|
|
157
|
+
def is_video_format(self, media_format: Optional[str] = None) -> bool:
|
|
158
|
+
"""Check whether the provided (or effective) format is a video format."""
|
|
159
|
+
return self.normalize_format(media_format) in VIDEO_FORMATS
|
|
160
|
+
|
|
161
|
+
def set_media_format(self, media_format: Optional[str], *, prefer_audio: Optional[bool] = None) -> str:
|
|
162
|
+
"""Update media_format and return the normalized value."""
|
|
163
|
+
normalized = self.normalize_format(media_format, prefer_audio=prefer_audio)
|
|
164
|
+
self.media_format = normalized
|
|
165
|
+
return normalized
|
|
166
|
+
|
|
167
|
+
def set_input_path(self, path: Pathlike) -> Path | str:
|
|
168
|
+
"""Update the input path (local path or URL) and infer format if possible."""
|
|
169
|
+
path = str(path)
|
|
170
|
+
if self._is_url(path):
|
|
171
|
+
normalized_url = self._normalize_url(path)
|
|
172
|
+
self.input_path = normalized_url
|
|
173
|
+
inferred_format = self._infer_format_from_source(normalized_url)
|
|
174
|
+
if inferred_format:
|
|
175
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
176
|
+
self.check_input_sanity()
|
|
177
|
+
return normalized_url
|
|
178
|
+
|
|
179
|
+
resolved = self._ensure_file(path)
|
|
180
|
+
self.input_path = str(resolved)
|
|
181
|
+
inferred_format = resolved.suffix.lstrip(".").lower()
|
|
182
|
+
if inferred_format:
|
|
183
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
184
|
+
self.check_input_sanity()
|
|
185
|
+
return resolved
|
|
186
|
+
|
|
187
|
+
def set_output_dir(self, output_dir: Pathlike) -> Path:
|
|
188
|
+
"""Update the output directory (creating it if needed)."""
|
|
189
|
+
resolved = self._ensure_dir(output_dir)
|
|
190
|
+
self.output_dir = resolved
|
|
191
|
+
return resolved
|
|
192
|
+
|
|
193
|
+
def set_output_path(self, output_path: Pathlike) -> Path:
|
|
194
|
+
"""Update the output path and synchronize output format and directory."""
|
|
195
|
+
resolved = self._ensure_file(output_path, must_exist=False, create_parent=True)
|
|
196
|
+
if not resolved.suffix:
|
|
197
|
+
raise ValueError("output_path must include a filename with an extension.")
|
|
198
|
+
fmt = resolved.suffix.lstrip(".").lower()
|
|
199
|
+
self.output_path = str(resolved)
|
|
200
|
+
self.output_dir = resolved.parent
|
|
201
|
+
self.output_format = self._normalize_format(fmt)
|
|
202
|
+
return resolved
|
|
203
|
+
|
|
204
|
+
def prepare_output_path(self, stem: Optional[str] = None, format: Optional[str] = None) -> Path:
|
|
205
|
+
"""Return an output path, creating one if not set yet."""
|
|
206
|
+
if self.output_path:
|
|
207
|
+
return Path(self.output_path)
|
|
208
|
+
|
|
209
|
+
effective_format = self.normalize_format(format or self.output_format or self.media_format)
|
|
210
|
+
base_name = stem or (self._derive_input_stem() or "output")
|
|
211
|
+
candidate = self.output_dir / f"{base_name}.{effective_format}"
|
|
212
|
+
self.output_path = str(candidate)
|
|
213
|
+
self.output_format = effective_format
|
|
214
|
+
return candidate
|
|
215
|
+
|
|
216
|
+
def is_input_remote(self) -> bool:
|
|
217
|
+
"""Return True if the configured input is a URL."""
|
|
218
|
+
return bool(self.input_path and self._is_url(self.input_path))
|
|
219
|
+
|
|
220
|
+
def check_input_sanity(self) -> None:
|
|
221
|
+
"""
|
|
222
|
+
Validate that input_path is properly configured and accessible.
|
|
223
|
+
|
|
224
|
+
Raises:
|
|
225
|
+
ValueError: If input_path is not set or is invalid.
|
|
226
|
+
FileNotFoundError: If input_path is a local file that does not exist.
|
|
227
|
+
"""
|
|
228
|
+
if not self.input_path:
|
|
229
|
+
raise ValueError("input_path is required but not set in MediaConfig")
|
|
230
|
+
|
|
231
|
+
if self._is_url(self.input_path):
|
|
232
|
+
# For URLs, validate that it's properly formatted
|
|
233
|
+
try:
|
|
234
|
+
parsed = urlparse(self.input_path)
|
|
235
|
+
if not parsed.scheme or not parsed.netloc:
|
|
236
|
+
raise ValueError(
|
|
237
|
+
f"Invalid URL format for input_path: '{self.input_path}'. "
|
|
238
|
+
"URL must include scheme (http/https) and domain."
|
|
239
|
+
)
|
|
240
|
+
except (ValueError, AttributeError) as e:
|
|
241
|
+
# ValueError: Invalid URL format
|
|
242
|
+
# AttributeError: urlparse issues with malformed input
|
|
243
|
+
raise ValueError(f"Failed to parse input_path as URL: {e}") from e
|
|
244
|
+
else:
|
|
245
|
+
# For local files, validate that the file exists and is accessible
|
|
246
|
+
input_file = Path(self.input_path).expanduser()
|
|
247
|
+
if not input_file.exists():
|
|
248
|
+
raise FileNotFoundError(
|
|
249
|
+
f"Input media file does not exist: '{input_file}'. " "Please check the path and try again."
|
|
250
|
+
)
|
|
251
|
+
if not input_file.is_file():
|
|
252
|
+
raise ValueError(
|
|
253
|
+
f"Input media path is not a file: '{input_file}'. " "Expected a valid media file path."
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
# ------------------------------------------------------------------
|
|
257
|
+
# Internal utilities
|
|
258
|
+
# ------------------------------------------------------------------
|
|
259
|
+
def _ensure_dir(self, directory: Pathlike) -> Path:
|
|
260
|
+
path = Path(directory).expanduser()
|
|
261
|
+
path.mkdir(parents=True, exist_ok=True)
|
|
262
|
+
if not path.is_dir():
|
|
263
|
+
raise NotADirectoryError(f"Output directory '{path}' is not a directory.")
|
|
264
|
+
return path
|
|
265
|
+
|
|
266
|
+
def _ensure_file(self, path: Pathlike, *, must_exist: bool = True, create_parent: bool = False) -> Path:
|
|
267
|
+
file_path = Path(path).expanduser()
|
|
268
|
+
if must_exist:
|
|
269
|
+
if not file_path.exists():
|
|
270
|
+
raise FileNotFoundError(f"Input media path '{file_path}' does not exist.")
|
|
271
|
+
if not file_path.is_file():
|
|
272
|
+
raise ValueError(f"Input media path '{file_path}' is not a file.")
|
|
273
|
+
else:
|
|
274
|
+
if create_parent:
|
|
275
|
+
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
276
|
+
return file_path
|
|
277
|
+
|
|
278
|
+
def _normalize_format(self, media_format: Optional[str], *, allow_auto: bool = False) -> str:
|
|
279
|
+
if media_format is None:
|
|
280
|
+
raise ValueError("media_format cannot be None")
|
|
281
|
+
normalized = media_format.strip().lower()
|
|
282
|
+
if not normalized:
|
|
283
|
+
raise ValueError("media_format cannot be empty")
|
|
284
|
+
if normalized == "auto":
|
|
285
|
+
if allow_auto:
|
|
286
|
+
return normalized
|
|
287
|
+
normalized = self.default_audio_format if self.prefer_audio else self.default_video_format
|
|
288
|
+
if normalized not in MEDIA_FORMATS:
|
|
289
|
+
raise ValueError(
|
|
290
|
+
"Unsupported media format '{fmt}'. Supported formats: {supported}".format(
|
|
291
|
+
fmt=media_format,
|
|
292
|
+
supported=", ".join(MEDIA_FORMATS),
|
|
293
|
+
)
|
|
294
|
+
)
|
|
295
|
+
return normalized
|
|
296
|
+
|
|
297
|
+
def _clean_url_escapes(self, url: str) -> str:
|
|
298
|
+
"""Remove shell escape backslashes from URL special characters."""
|
|
299
|
+
return url.strip().replace(r"\?", "?").replace(r"\=", "=").replace(r"\&", "&")
|
|
300
|
+
|
|
301
|
+
def _is_url(self, value: Pathlike) -> bool:
|
|
302
|
+
if not isinstance(value, str):
|
|
303
|
+
return False
|
|
304
|
+
cleaned = self._clean_url_escapes(value)
|
|
305
|
+
parsed = urlparse(cleaned)
|
|
306
|
+
return bool(parsed.scheme and parsed.netloc)
|
|
307
|
+
|
|
308
|
+
def _normalize_url(self, url: str) -> str:
|
|
309
|
+
cleaned = self._clean_url_escapes(url)
|
|
310
|
+
parsed = urlparse(cleaned)
|
|
311
|
+
if not parsed.scheme or not parsed.netloc:
|
|
312
|
+
raise ValueError("input_path must be an absolute URL when provided as a remote source.")
|
|
313
|
+
return cleaned
|
|
314
|
+
|
|
315
|
+
def _infer_format_from_source(self, source: str) -> Optional[str]:
|
|
316
|
+
path_segment = Path(urlparse(source).path) if self._is_url(source) else Path(source)
|
|
317
|
+
suffix = path_segment.suffix.lstrip(".").lower()
|
|
318
|
+
return suffix or None
|
|
319
|
+
|
|
320
|
+
def _derive_input_stem(self) -> Optional[str]:
|
|
321
|
+
if not self.input_path:
|
|
322
|
+
return None
|
|
323
|
+
if self.is_input_remote():
|
|
324
|
+
path_segment = Path(urlparse(self.input_path).path)
|
|
325
|
+
stem = path_segment.stem
|
|
326
|
+
return stem or None
|
|
327
|
+
return Path(self.input_path).stem or None
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
# Names re-exported by ``from lattifai.config.media import *``.
__all__ = ["MediaConfig", "AUDIO_FORMATS", "VIDEO_FORMATS", "MEDIA_FORMATS"]
|
|
lattifai/config/transcription.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Transcription service configuration for LattifAI."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import TYPE_CHECKING, Literal, Optional
|
|
6
|
+
|
|
7
|
+
from ..utils import _select_device
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from ..base_client import SyncAPIClient
|
|
11
|
+
|
|
12
|
+
# Closed set of model identifiers accepted by TranscriptionConfig.model_name.
SUPPORTED_TRANSCRIPTION_MODELS = Literal[
    # Gemini models
    "gemini-2.5-pro", "gemini-3-pro-preview",
    # NVIDIA models
    "nvidia/parakeet-tdt-0.6b-v3", "nvidia/canary-1b-v2",
    # SenseVoice
    "iic/SenseVoiceSmall",
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class TranscriptionConfig:
    """
    Transcription service configuration.

    Settings for audio/video transcription using various providers.

    ``__post_init__`` validates ``model_name``, ``max_retries`` and ``device``
    first, and only then loads a ``.env`` file, fills ``gemini_api_key`` from
    the environment and resolves ``device="auto"`` to a concrete device.
    """

    model_name: SUPPORTED_TRANSCRIPTION_MODELS = "nvidia/parakeet-tdt-0.6b-v3"
    """Model name for transcription."""

    gemini_api_key: Optional[str] = None
    """Gemini API key. If None, reads from GEMINI_API_KEY environment variable."""

    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
    """Computation device for transcription models."""

    max_retries: int = 0
    """Maximum number of retry attempts for failed transcription requests."""

    force_overwrite: bool = False
    """Force overwrite existing transcription files."""

    verbose: bool = False
    """Enable debug logging for transcription operations."""

    language: Optional[str] = None
    """Target language code for transcription (e.g., 'en', 'zh', 'ja')."""

    lattice_model_path: Optional[str] = None
    """Path to local LattifAI model. Will be auto-set in LattifAI client."""

    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

    def __post_init__(self):
        """Validate and auto-populate configuration after initialization.

        Raises:
            ValueError: If ``model_name`` is unsupported, ``max_retries`` is
                negative, or ``device`` is not an accepted device string.
        """
        # Public accessor for the Literal's members (instead of ``__args__``).
        from typing import get_args

        supported_models = get_args(SUPPORTED_TRANSCRIPTION_MODELS)
        if self.model_name not in supported_models:
            raise ValueError(
                f"Unsupported model_name: '{self.model_name}'. "
                f"Supported models are: {supported_models}"
            )

        # Validate max_retries
        if self.max_retries < 0:
            raise ValueError("max_retries must be non-negative")

        # Validate device. "cuda:<index>" is accepted in addition to the plain
        # names; the isinstance guard reports a non-string device as a
        # ValueError rather than an AttributeError from .startswith().
        known_devices = ("cpu", "cuda", "mps", "auto")
        device_ok = isinstance(self.device, str) and (
            self.device in known_devices or self.device.startswith("cuda:")
        )
        if not device_ok:
            raise ValueError(
                f"device must be one of ('cpu', 'cuda', 'mps', 'auto') or 'cuda:<index>', got '{self.device}'"
            )

        # Everything below has side effects, so it runs only for a valid config.

        # Load environment variables from .env file
        from dotenv import find_dotenv, load_dotenv

        # Try to find and load .env file from current directory or parent directories
        load_dotenv(find_dotenv(usecwd=True))

        # Auto-load Gemini API key from environment if not provided
        if self.gemini_api_key is None:
            self.gemini_api_key = os.environ.get("GEMINI_API_KEY")

        if self.device == "auto":
            self.device = _select_device(self.device)
|
|
lattifai/diarization/lattifai.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""LattifAI speaker diarization implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
from tgt import Interval, IntervalTier, TextGrid
|
|
9
|
+
|
|
10
|
+
from lattifai.audio2 import AudioData
|
|
11
|
+
from lattifai.caption import Supervision
|
|
12
|
+
from lattifai.config.diarization import DiarizationConfig
|
|
13
|
+
from lattifai.logging import get_logger
|
|
14
|
+
|
|
15
|
+
# Placeholder label for a speaker that could not be identified.
NOT_KNOWN = "NotKnown"

# Root-logger format: timestamp, level, source location, message.
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
# NOTE: runs at import time and configures the root logger (no-op if it already has handlers).
logging.basicConfig(level=logging.INFO, format=formatter)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LattifAIDiarizer:
    """
    Thin front-end for the diarizer shipped in ``lattifai_core``.

    The core pipeline is imported and built on first use (see ``diarizer``);
    every public method simply forwards to it.
    """

    def __init__(self, config: Optional[DiarizationConfig] = None):
        """
        Initialize LattifAI diarizer.

        Args:
            config: Diarization configuration; a default ``DiarizationConfig``
                is created when omitted.
        """
        self.config = DiarizationConfig() if config is None else config
        self.logger = get_logger("diarization")

        # Built lazily by the ``diarizer`` property.
        self._diarizer = None

    @property
    def name(self) -> str:
        """Human-readable name of the diarizer."""
        return "LattifAI_Diarizer"

    @property
    def diarizer(self):
        """Return the core diarization pipeline, constructing it on first access."""
        if self._diarizer is not None:
            return self._diarizer

        # Deferred import: lattifai_core is only needed once diarization is used.
        from lattifai_core.diarization import LattifAIDiarizer as CoreLattifAIDiarizer

        self._diarizer = CoreLattifAIDiarizer(config=self.config)
        return self._diarizer

    def diarize(
        self,
        input_media: AudioData,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ) -> TextGrid:
        """Run speaker diarization on ``input_media`` via the core pipeline.

        The speaker-count arguments are forwarded unchanged.
        """
        speaker_hints = {
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
        }
        return self.diarizer.diarize(input_media, **speaker_hints)

    def diarize_with_alignments(
        self,
        input_media: AudioData,
        alignments: List[Supervision],
        diarization: Optional[TextGrid] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ) -> Tuple[TextGrid, List[Supervision]]:
        """Diarize ``input_media`` and return alignments with refined speaker labels.

        All arguments are forwarded unchanged to the core pipeline's method of
        the same name.
        """
        forwarded = {
            "alignments": alignments,
            "diarization": diarization,
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
        }
        return self.diarizer.diarize_with_alignments(input_media, **forwarded)
|
lattifai/errors.py
CHANGED
|
@@ -8,15 +8,15 @@ import colorful
|
|
|
8
8
|
# Error help messages
|
|
9
9
|
LATTICE_DECODING_FAILURE_HELP = (
|
|
10
10
|
"Failed to decode lattice alignment. Possible reasons:\n\n"
|
|
11
|
-
"1) Audio and text content mismatch:\n"
|
|
12
|
-
" - The transcript/
|
|
13
|
-
" - Text may be from a different version or section of the
|
|
14
|
-
" ⚠️ Note: Gemini transcription may occasionally skip large segments of
|
|
11
|
+
"1) Media(Audio/Video) and text content mismatch:\n"
|
|
12
|
+
" - The transcript/caption does not accurately match the media content\n"
|
|
13
|
+
" - Text may be from a different version or section of the media\n"
|
|
14
|
+
" ⚠️ Note: Gemini transcription may occasionally skip large segments of media, causing alignment failures.\n"
|
|
15
15
|
" We will detect and fix this issue in the next version.\n\n"
|
|
16
|
-
"2) Unsupported
|
|
16
|
+
"2) Unsupported media type:\n"
|
|
17
17
|
" - Singing is not yet supported, this will be optimized in future versions\n\n"
|
|
18
18
|
"💡 Troubleshooting tips:\n"
|
|
19
|
-
" • Verify the transcript matches the
|
|
19
|
+
" • Verify the transcript matches the media by listening to a few segments\n"
|
|
20
20
|
" • For YouTube videos, manually check if auto-generated transcript are accurate\n"
|
|
21
21
|
" • Consider using a different transcription source if Gemini results are incomplete"
|
|
22
22
|
)
|
|
@@ -45,7 +45,7 @@ class LattifAIError(Exception):
|
|
|
45
45
|
f' 1. 📝 Create a GitHub issue: {colorful.green("https://github.com/lattifai/lattifai-python/issues")}\n'
|
|
46
46
|
" Please include:\n"
|
|
47
47
|
" - Your audio file format and duration\n"
|
|
48
|
-
" - The text/
|
|
48
|
+
" - The text/caption content you're trying to align\n"
|
|
49
49
|
" - This error message and stack trace\n"
|
|
50
50
|
f' 2. 💬 Join our Discord community: {colorful.green("https://discord.gg/vzmTzzZgNu")}\n'
|
|
51
51
|
" Our team and community can help you troubleshoot\n"
|
|
@@ -71,10 +71,10 @@ class LattifAIError(Exception):
|
|
|
71
71
|
class AudioProcessingError(LattifAIError):
|
|
72
72
|
"""Error during audio processing operations."""
|
|
73
73
|
|
|
74
|
-
def __init__(self, message: str,
|
|
74
|
+
def __init__(self, message: str, media_path: Optional[str] = None, **kwargs):
|
|
75
75
|
context = kwargs.get("context", {})
|
|
76
|
-
if
|
|
77
|
-
context["
|
|
76
|
+
if media_path:
|
|
77
|
+
context["media_path"] = media_path
|
|
78
78
|
kwargs["context"] = context
|
|
79
79
|
super().__init__(message, **kwargs)
|
|
80
80
|
|
|
@@ -82,60 +82,60 @@ class AudioProcessingError(LattifAIError):
|
|
|
82
82
|
class AudioLoadError(AudioProcessingError):
|
|
83
83
|
"""Error loading or reading audio file."""
|
|
84
84
|
|
|
85
|
-
def __init__(self,
|
|
86
|
-
message = f"Failed to load audio file: {colorful.red(
|
|
85
|
+
def __init__(self, media_path: str, original_error: Optional[Exception] = None, **kwargs):
|
|
86
|
+
message = f"Failed to load audio file: {colorful.red(media_path)}"
|
|
87
87
|
if original_error:
|
|
88
88
|
message += f" - {colorful.red(str(original_error))}"
|
|
89
89
|
|
|
90
90
|
context = kwargs.get("context", {})
|
|
91
|
-
context.update({"
|
|
91
|
+
context.update({"media_path": media_path, "original_error": str(original_error) if original_error else None})
|
|
92
92
|
kwargs["context"] = context
|
|
93
93
|
|
|
94
|
-
super().__init__(message,
|
|
94
|
+
super().__init__(message, media_path=media_path, **kwargs)
|
|
95
95
|
|
|
96
96
|
|
|
97
97
|
class AudioFormatError(AudioProcessingError):
|
|
98
98
|
"""Error with audio format or codec."""
|
|
99
99
|
|
|
100
|
-
def __init__(self,
|
|
101
|
-
message = f"Audio format error for {colorful.red(
|
|
100
|
+
def __init__(self, media_path: str, format_issue: str, **kwargs):
|
|
101
|
+
message = f"Audio format error for {colorful.red(media_path)}: {colorful.red(format_issue)}"
|
|
102
102
|
context = kwargs.get("context", {})
|
|
103
|
-
context.update({"
|
|
103
|
+
context.update({"media_path": media_path, "format_issue": format_issue})
|
|
104
104
|
kwargs["context"] = context
|
|
105
|
-
super().__init__(message,
|
|
105
|
+
super().__init__(message, media_path=media_path, **kwargs)
|
|
106
106
|
|
|
107
107
|
|
|
108
|
-
class
|
|
109
|
-
"""Error during
|
|
108
|
+
class CaptionProcessingError(LattifAIError):
|
|
109
|
+
"""Error during caption/text processing operations."""
|
|
110
110
|
|
|
111
|
-
def __init__(self, message: str,
|
|
111
|
+
def __init__(self, message: str, caption_path: Optional[str] = None, **kwargs):
|
|
112
112
|
context = kwargs.get("context", {})
|
|
113
|
-
if
|
|
114
|
-
context["
|
|
113
|
+
if caption_path:
|
|
114
|
+
context["caption_path"] = caption_path
|
|
115
115
|
kwargs["context"] = context
|
|
116
116
|
super().__init__(message, **kwargs)
|
|
117
117
|
|
|
118
118
|
|
|
119
|
-
class
|
|
120
|
-
"""Error parsing
|
|
119
|
+
class CaptionParseError(CaptionProcessingError):
|
|
120
|
+
"""Error parsing caption or text file."""
|
|
121
121
|
|
|
122
|
-
def __init__(self,
|
|
123
|
-
message = f"Failed to parse
|
|
122
|
+
def __init__(self, caption_path: str, parse_issue: str, **kwargs):
|
|
123
|
+
message = f"Failed to parse caption file {caption_path}: {parse_issue}"
|
|
124
124
|
context = kwargs.get("context", {})
|
|
125
|
-
context.update({"
|
|
125
|
+
context.update({"caption_path": caption_path, "parse_issue": parse_issue})
|
|
126
126
|
kwargs["context"] = context
|
|
127
|
-
super().__init__(message,
|
|
127
|
+
super().__init__(message, caption_path=caption_path, **kwargs)
|
|
128
128
|
|
|
129
129
|
|
|
130
130
|
class AlignmentError(LattifAIError):
|
|
131
131
|
"""Error during audio-text alignment process."""
|
|
132
132
|
|
|
133
|
-
def __init__(self, message: str,
|
|
133
|
+
def __init__(self, message: str, media_path: Optional[str] = None, caption_path: Optional[str] = None, **kwargs):
|
|
134
134
|
context = kwargs.get("context", {})
|
|
135
|
-
if
|
|
136
|
-
context["
|
|
137
|
-
if
|
|
138
|
-
context["
|
|
135
|
+
if media_path:
|
|
136
|
+
context["media_path"] = media_path
|
|
137
|
+
if caption_path:
|
|
138
|
+
context["caption_path"] = caption_path
|
|
139
139
|
kwargs["context"] = context
|
|
140
140
|
super().__init__(message, **kwargs)
|
|
141
141
|
|
|
@@ -235,6 +235,13 @@ class ConfigurationError(LattifAIError):
|
|
|
235
235
|
super().__init__(message, **kwargs)
|
|
236
236
|
|
|
237
237
|
|
|
238
|
+
class QuotaExceededError(APIError):
    """Error when user quota or API key limit is exceeded.

    Reports HTTP status 402 unless the caller supplies its own ``status_code``.
    """

    def __init__(self, message: str, **kwargs):
        # setdefault instead of a hard-coded keyword: passing status_code
        # explicitly used to fail with "got multiple values for keyword
        # argument 'status_code'".
        kwargs.setdefault("status_code", 402)
        super().__init__(message, **kwargs)
|
|
243
|
+
|
|
244
|
+
|
|
238
245
|
def handle_exception(func):
|
|
239
246
|
"""Decorator to handle exceptions and convert them to LattifAI errors."""
|
|
240
247
|
|