lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/config/media.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""Media I/O configuration for LattifAI."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field, replace
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
from lhotse.utils import Pathlike
|
|
9
|
+
|
|
10
|
+
# Supported media formats for both audio and video content
|
|
11
|
+
AUDIO_FORMATS = (
|
|
12
|
+
"aac",
|
|
13
|
+
"aiff",
|
|
14
|
+
"alac",
|
|
15
|
+
"flac",
|
|
16
|
+
"m4a",
|
|
17
|
+
"mp3",
|
|
18
|
+
"ogg",
|
|
19
|
+
"opus",
|
|
20
|
+
"wav",
|
|
21
|
+
"wma",
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
VIDEO_FORMATS = (
|
|
25
|
+
"3gp",
|
|
26
|
+
"avi",
|
|
27
|
+
"flv",
|
|
28
|
+
"m4v",
|
|
29
|
+
"mkv",
|
|
30
|
+
"mov",
|
|
31
|
+
"mp4",
|
|
32
|
+
"mpeg",
|
|
33
|
+
"mpg",
|
|
34
|
+
"webm",
|
|
35
|
+
"wmv",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
MEDIA_FORMATS = tuple(sorted(set(AUDIO_FORMATS + VIDEO_FORMATS)))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
|
|
42
|
+
class MediaConfig:
|
|
43
|
+
"""Unified configuration for audio/video input and output handling."""
|
|
44
|
+
|
|
45
|
+
# Input configuration (local filesystem path or URL)
|
|
46
|
+
input_path: Optional[str] = None
|
|
47
|
+
"""Local file path or URL to audio/video content."""
|
|
48
|
+
|
|
49
|
+
media_format: str = "auto"
|
|
50
|
+
"""Media format (mp3, wav, mp4, etc.) or 'auto' for automatic detection."""
|
|
51
|
+
|
|
52
|
+
sample_rate: Optional[int] = None
|
|
53
|
+
"""Audio sample rate in Hz (e.g., 16000, 44100)."""
|
|
54
|
+
|
|
55
|
+
channels: Optional[int] = None
|
|
56
|
+
"""Number of audio channels (1=mono, 2=stereo)."""
|
|
57
|
+
|
|
58
|
+
channel_selector: Optional[str | int] = "average"
|
|
59
|
+
"""Audio channel selection strategy: 'average', 'left', 'right', or channel index."""
|
|
60
|
+
|
|
61
|
+
# Output / download configuration
|
|
62
|
+
output_dir: Path = field(default_factory=lambda: Path.cwd())
|
|
63
|
+
"""Directory for output files (default: current working directory)."""
|
|
64
|
+
|
|
65
|
+
output_path: Optional[str] = None
|
|
66
|
+
"""Full path for output file (overrides output_dir + filename)."""
|
|
67
|
+
|
|
68
|
+
output_format: Optional[str] = None
|
|
69
|
+
"""Output media format (mp3, wav, mp4, etc.)."""
|
|
70
|
+
|
|
71
|
+
prefer_audio: bool = True
|
|
72
|
+
"""Prefer audio format when 'auto' is specified."""
|
|
73
|
+
|
|
74
|
+
default_audio_format: str = "mp3"
|
|
75
|
+
"""Default audio format when no format is specified."""
|
|
76
|
+
|
|
77
|
+
default_video_format: str = "mp4"
|
|
78
|
+
"""Default video format when no format is specified."""
|
|
79
|
+
|
|
80
|
+
force_overwrite: bool = False
|
|
81
|
+
"""Overwrite existing output files without prompting."""
|
|
82
|
+
|
|
83
|
+
def __post_init__(self) -> None:
|
|
84
|
+
"""Validate configuration and normalize paths/formats."""
|
|
85
|
+
self._setup_output_directory()
|
|
86
|
+
self._validate_default_formats()
|
|
87
|
+
self._normalize_media_format()
|
|
88
|
+
self._process_input_path()
|
|
89
|
+
self._process_output_path()
|
|
90
|
+
|
|
91
|
+
def _setup_output_directory(self) -> None:
|
|
92
|
+
"""Ensure output directory exists and is valid."""
|
|
93
|
+
resolved_output_dir = self._ensure_dir(self.output_dir)
|
|
94
|
+
self.output_dir = resolved_output_dir
|
|
95
|
+
|
|
96
|
+
def _validate_default_formats(self) -> None:
|
|
97
|
+
"""Validate default audio and video formats."""
|
|
98
|
+
self.default_audio_format = self._normalize_format(self.default_audio_format)
|
|
99
|
+
self.default_video_format = self._normalize_format(self.default_video_format)
|
|
100
|
+
|
|
101
|
+
def _normalize_media_format(self) -> None:
|
|
102
|
+
"""Normalize media format, allowing 'auto' during initialization."""
|
|
103
|
+
self.media_format = self._normalize_format(self.media_format, allow_auto=True)
|
|
104
|
+
|
|
105
|
+
def _process_input_path(self) -> None:
|
|
106
|
+
"""Process and validate input path if provided."""
|
|
107
|
+
if self.input_path is None:
|
|
108
|
+
return
|
|
109
|
+
|
|
110
|
+
if self._is_url(self.input_path):
|
|
111
|
+
normalized_url = self._normalize_url(self.input_path)
|
|
112
|
+
self.input_path = normalized_url
|
|
113
|
+
if self.media_format == "auto":
|
|
114
|
+
inferred_format = self._infer_format_from_source(normalized_url)
|
|
115
|
+
if inferred_format:
|
|
116
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
117
|
+
else:
|
|
118
|
+
# For local paths, normalize to string without validation here
|
|
119
|
+
# Validation will be done in check_input_sanity()
|
|
120
|
+
self.input_path = str(Path(self.input_path).expanduser())
|
|
121
|
+
if self.media_format == "auto":
|
|
122
|
+
inferred_format = Path(self.input_path).suffix.lstrip(".").lower()
|
|
123
|
+
if inferred_format:
|
|
124
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
125
|
+
|
|
126
|
+
# Validate input after setting
|
|
127
|
+
self.check_input_sanity()
|
|
128
|
+
|
|
129
|
+
def _process_output_path(self) -> None:
|
|
130
|
+
"""Process output path and format."""
|
|
131
|
+
if self.output_path is not None:
|
|
132
|
+
self.set_output_path(self.output_path)
|
|
133
|
+
elif self.output_format is not None:
|
|
134
|
+
self.output_format = self._normalize_format(self.output_format)
|
|
135
|
+
else:
|
|
136
|
+
self.output_format = None
|
|
137
|
+
|
|
138
|
+
# ------------------------------------------------------------------
|
|
139
|
+
# Public helpers
|
|
140
|
+
# ------------------------------------------------------------------
|
|
141
|
+
def clone(self, **updates: object) -> "MediaConfig":
|
|
142
|
+
"""Return a shallow copy of the config with optional overrides."""
|
|
143
|
+
return replace(self, **updates)
|
|
144
|
+
|
|
145
|
+
def normalize_format(self, media_format: Optional[str] = None, *, prefer_audio: Optional[bool] = None) -> str:
|
|
146
|
+
"""Resolve a media format (handling the special "auto" value)."""
|
|
147
|
+
prefer_audio = self.prefer_audio if prefer_audio is None else prefer_audio
|
|
148
|
+
candidate = (media_format or self.media_format or "auto").lower()
|
|
149
|
+
if candidate == "auto":
|
|
150
|
+
candidate = self.default_audio_format if prefer_audio else self.default_video_format
|
|
151
|
+
return self._normalize_format(candidate)
|
|
152
|
+
|
|
153
|
+
def is_audio_format(self, media_format: Optional[str] = None) -> bool:
|
|
154
|
+
"""Check whether the provided (or effective) format is an audio format."""
|
|
155
|
+
return self.normalize_format(media_format) in AUDIO_FORMATS
|
|
156
|
+
|
|
157
|
+
def is_video_format(self, media_format: Optional[str] = None) -> bool:
|
|
158
|
+
"""Check whether the provided (or effective) format is a video format."""
|
|
159
|
+
return self.normalize_format(media_format) in VIDEO_FORMATS
|
|
160
|
+
|
|
161
|
+
def set_media_format(self, media_format: Optional[str], *, prefer_audio: Optional[bool] = None) -> str:
|
|
162
|
+
"""Update media_format and return the normalized value."""
|
|
163
|
+
normalized = self.normalize_format(media_format, prefer_audio=prefer_audio)
|
|
164
|
+
self.media_format = normalized
|
|
165
|
+
return normalized
|
|
166
|
+
|
|
167
|
+
def set_input_path(self, path: Pathlike) -> Path | str:
|
|
168
|
+
"""Update the input path (local path or URL) and infer format if possible."""
|
|
169
|
+
path = str(path)
|
|
170
|
+
if self._is_url(path):
|
|
171
|
+
normalized_url = self._normalize_url(path)
|
|
172
|
+
self.input_path = normalized_url
|
|
173
|
+
inferred_format = self._infer_format_from_source(normalized_url)
|
|
174
|
+
if inferred_format:
|
|
175
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
176
|
+
self.check_input_sanity()
|
|
177
|
+
return normalized_url
|
|
178
|
+
|
|
179
|
+
resolved = self._ensure_file(path)
|
|
180
|
+
self.input_path = str(resolved)
|
|
181
|
+
inferred_format = resolved.suffix.lstrip(".").lower()
|
|
182
|
+
if inferred_format:
|
|
183
|
+
self.media_format = self._normalize_format(inferred_format)
|
|
184
|
+
self.check_input_sanity()
|
|
185
|
+
return resolved
|
|
186
|
+
|
|
187
|
+
def set_output_dir(self, output_dir: Pathlike) -> Path:
|
|
188
|
+
"""Update the output directory (creating it if needed)."""
|
|
189
|
+
resolved = self._ensure_dir(output_dir)
|
|
190
|
+
self.output_dir = resolved
|
|
191
|
+
return resolved
|
|
192
|
+
|
|
193
|
+
def set_output_path(self, output_path: Pathlike) -> Path:
|
|
194
|
+
"""Update the output path and synchronize output format and directory."""
|
|
195
|
+
resolved = self._ensure_file(output_path, must_exist=False, create_parent=True)
|
|
196
|
+
if not resolved.suffix:
|
|
197
|
+
raise ValueError("output_path must include a filename with an extension.")
|
|
198
|
+
fmt = resolved.suffix.lstrip(".").lower()
|
|
199
|
+
self.output_path = str(resolved)
|
|
200
|
+
self.output_dir = resolved.parent
|
|
201
|
+
self.output_format = self._normalize_format(fmt)
|
|
202
|
+
return resolved
|
|
203
|
+
|
|
204
|
+
def prepare_output_path(self, stem: Optional[str] = None, format: Optional[str] = None) -> Path:
|
|
205
|
+
"""Return an output path, creating one if not set yet."""
|
|
206
|
+
if self.output_path:
|
|
207
|
+
return Path(self.output_path)
|
|
208
|
+
|
|
209
|
+
effective_format = self.normalize_format(format or self.output_format or self.media_format)
|
|
210
|
+
base_name = stem or (self._derive_input_stem() or "output")
|
|
211
|
+
candidate = self.output_dir / f"{base_name}.{effective_format}"
|
|
212
|
+
self.output_path = str(candidate)
|
|
213
|
+
self.output_format = effective_format
|
|
214
|
+
return candidate
|
|
215
|
+
|
|
216
|
+
def is_input_remote(self) -> bool:
|
|
217
|
+
"""Return True if the configured input is a URL."""
|
|
218
|
+
return bool(self.input_path and self._is_url(self.input_path))
|
|
219
|
+
|
|
220
|
+
def check_input_sanity(self) -> None:
|
|
221
|
+
"""
|
|
222
|
+
Validate that input_path is properly configured and accessible.
|
|
223
|
+
|
|
224
|
+
Raises:
|
|
225
|
+
ValueError: If input_path is not set or is invalid.
|
|
226
|
+
FileNotFoundError: If input_path is a local file that does not exist.
|
|
227
|
+
"""
|
|
228
|
+
if not self.input_path:
|
|
229
|
+
raise ValueError("input_path is required but not set in MediaConfig")
|
|
230
|
+
|
|
231
|
+
if self._is_url(self.input_path):
|
|
232
|
+
# For URLs, validate that it's properly formatted
|
|
233
|
+
try:
|
|
234
|
+
parsed = urlparse(self.input_path)
|
|
235
|
+
if not parsed.scheme or not parsed.netloc:
|
|
236
|
+
raise ValueError(
|
|
237
|
+
f"Invalid URL format for input_path: '{self.input_path}'. "
|
|
238
|
+
"URL must include scheme (http/https) and domain."
|
|
239
|
+
)
|
|
240
|
+
except (ValueError, AttributeError) as e:
|
|
241
|
+
# ValueError: Invalid URL format
|
|
242
|
+
# AttributeError: urlparse issues with malformed input
|
|
243
|
+
raise ValueError(f"Failed to parse input_path as URL: {e}") from e
|
|
244
|
+
else:
|
|
245
|
+
# For local files, validate that the file exists and is accessible
|
|
246
|
+
input_file = Path(self.input_path).expanduser()
|
|
247
|
+
if not input_file.exists():
|
|
248
|
+
raise FileNotFoundError(
|
|
249
|
+
f"Input media file does not exist: '{input_file}'. " "Please check the path and try again."
|
|
250
|
+
)
|
|
251
|
+
if not input_file.is_file():
|
|
252
|
+
raise ValueError(
|
|
253
|
+
f"Input media path is not a file: '{input_file}'. " "Expected a valid media file path."
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
# ------------------------------------------------------------------
|
|
257
|
+
# Internal utilities
|
|
258
|
+
# ------------------------------------------------------------------
|
|
259
|
+
def _ensure_dir(self, directory: Pathlike) -> Path:
|
|
260
|
+
path = Path(directory).expanduser()
|
|
261
|
+
path.mkdir(parents=True, exist_ok=True)
|
|
262
|
+
if not path.is_dir():
|
|
263
|
+
raise NotADirectoryError(f"Output directory '{path}' is not a directory.")
|
|
264
|
+
return path
|
|
265
|
+
|
|
266
|
+
def _ensure_file(self, path: Pathlike, *, must_exist: bool = True, create_parent: bool = False) -> Path:
|
|
267
|
+
file_path = Path(path).expanduser()
|
|
268
|
+
if must_exist:
|
|
269
|
+
if not file_path.exists():
|
|
270
|
+
raise FileNotFoundError(f"Input media path '{file_path}' does not exist.")
|
|
271
|
+
if not file_path.is_file():
|
|
272
|
+
raise ValueError(f"Input media path '{file_path}' is not a file.")
|
|
273
|
+
else:
|
|
274
|
+
if create_parent:
|
|
275
|
+
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
276
|
+
return file_path
|
|
277
|
+
|
|
278
|
+
def _normalize_format(self, media_format: Optional[str], *, allow_auto: bool = False) -> str:
|
|
279
|
+
if media_format is None:
|
|
280
|
+
raise ValueError("media_format cannot be None")
|
|
281
|
+
normalized = media_format.strip().lower()
|
|
282
|
+
if not normalized:
|
|
283
|
+
raise ValueError("media_format cannot be empty")
|
|
284
|
+
if normalized == "auto":
|
|
285
|
+
if allow_auto:
|
|
286
|
+
return normalized
|
|
287
|
+
normalized = self.default_audio_format if self.prefer_audio else self.default_video_format
|
|
288
|
+
if normalized not in MEDIA_FORMATS:
|
|
289
|
+
raise ValueError(
|
|
290
|
+
"Unsupported media format '{fmt}'. Supported formats: {supported}".format(
|
|
291
|
+
fmt=media_format,
|
|
292
|
+
supported=", ".join(MEDIA_FORMATS),
|
|
293
|
+
)
|
|
294
|
+
)
|
|
295
|
+
return normalized
|
|
296
|
+
|
|
297
|
+
def _clean_url_escapes(self, url: str) -> str:
|
|
298
|
+
"""Remove shell escape backslashes from URL special characters."""
|
|
299
|
+
return url.strip().replace(r"\?", "?").replace(r"\=", "=").replace(r"\&", "&")
|
|
300
|
+
|
|
301
|
+
def _is_url(self, value: Pathlike) -> bool:
|
|
302
|
+
if not isinstance(value, str):
|
|
303
|
+
return False
|
|
304
|
+
cleaned = self._clean_url_escapes(value)
|
|
305
|
+
parsed = urlparse(cleaned)
|
|
306
|
+
return bool(parsed.scheme and parsed.netloc)
|
|
307
|
+
|
|
308
|
+
def _normalize_url(self, url: str) -> str:
|
|
309
|
+
cleaned = self._clean_url_escapes(url)
|
|
310
|
+
parsed = urlparse(cleaned)
|
|
311
|
+
if not parsed.scheme or not parsed.netloc:
|
|
312
|
+
raise ValueError("input_path must be an absolute URL when provided as a remote source.")
|
|
313
|
+
return cleaned
|
|
314
|
+
|
|
315
|
+
def _infer_format_from_source(self, source: str) -> Optional[str]:
|
|
316
|
+
path_segment = Path(urlparse(source).path) if self._is_url(source) else Path(source)
|
|
317
|
+
suffix = path_segment.suffix.lstrip(".").lower()
|
|
318
|
+
return suffix or None
|
|
319
|
+
|
|
320
|
+
def _derive_input_stem(self) -> Optional[str]:
|
|
321
|
+
if not self.input_path:
|
|
322
|
+
return None
|
|
323
|
+
if self.is_input_remote():
|
|
324
|
+
path_segment = Path(urlparse(self.input_path).path)
|
|
325
|
+
stem = path_segment.stem
|
|
326
|
+
return stem or None
|
|
327
|
+
return Path(self.input_path).stem or None
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
__all__ = [
|
|
331
|
+
"MediaConfig",
|
|
332
|
+
"AUDIO_FORMATS",
|
|
333
|
+
"VIDEO_FORMATS",
|
|
334
|
+
"MEDIA_FORMATS",
|
|
335
|
+
]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Transcription service configuration for LattifAI."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import TYPE_CHECKING, Literal, Optional
|
|
6
|
+
|
|
7
|
+
from ..utils import _select_device
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from ..base_client import SyncAPIClient
|
|
11
|
+
|
|
12
|
+
# Closed set of model identifiers accepted by TranscriptionConfig.model_name.
SUPPORTED_TRANSCRIPTION_MODELS = Literal[
    "gemini-2.5-pro",
    "gemini-3-pro-preview",
    "nvidia/parakeet-tdt-0.6b-v3",
    "nvidia/canary-1b-v2",
    "iic/SenseVoiceSmall",
]


@dataclass
class TranscriptionConfig:
    """
    Transcription service configuration.

    Settings for audio/video transcription using various providers.
    """

    model_name: SUPPORTED_TRANSCRIPTION_MODELS = "nvidia/parakeet-tdt-0.6b-v3"
    """Model name for transcription."""

    gemini_api_key: Optional[str] = None
    """Gemini API key. If None, reads from GEMINI_API_KEY environment variable."""

    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
    """Computation device for transcription models ('cuda:<index>' is also accepted)."""

    max_retries: int = 0
    """Maximum number of retry attempts for failed transcription requests."""

    force_overwrite: bool = False
    """Force overwrite existing transcription files."""

    verbose: bool = False
    """Enable debug logging for transcription operations."""

    language: Optional[str] = None
    """Target language code for transcription (e.g., 'en', 'zh', 'ja')."""

    lattice_model_path: Optional[str] = None
    """Path to local LattifAI model. Will be auto-set in LattifAI client."""

    # NOTE(review): "SyncAPIClient" is only imported under TYPE_CHECKING from
    # ``..base_client`` — confirm that module still exists in this release.
    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

    def __post_init__(self):
        """Validate and auto-populate configuration after initialization.

        Validation runs first, so an invalid configuration fails before any
        ``.env`` file is loaded or a device is selected.

        Raises:
            ValueError: If ``model_name`` is unsupported, ``max_retries`` is
                negative, or ``device`` is not a recognized device string.
        """
        # get_args is the public accessor for the members of a Literal type.
        from typing import get_args

        supported_models = get_args(SUPPORTED_TRANSCRIPTION_MODELS)
        if self.model_name not in supported_models:
            raise ValueError(
                f"Unsupported model_name: '{self.model_name}'. " f"Supported models are: {supported_models}"
            )

        # Validate max_retries
        if self.max_retries < 0:
            raise ValueError("max_retries must be non-negative")

        # Validate device; the isinstance guard turns a non-string value into
        # the same ValueError instead of an AttributeError on startswith().
        named_devices = ("cpu", "cuda", "mps", "auto")
        if not isinstance(self.device, str) or (
            self.device not in named_devices and not self.device.startswith("cuda:")
        ):
            raise ValueError(
                f"device must be one of ('cpu', 'cuda', 'mps', 'auto') or 'cuda:<index>', got '{self.device}'"
            )

        # Load environment variables from .env file
        from dotenv import find_dotenv, load_dotenv

        # Try to find and load .env file from current directory or parent directories
        load_dotenv(find_dotenv(usecwd=True))

        # Auto-load Gemini API key from environment if not provided
        if self.gemini_api_key is None:
            self.gemini_api_key = os.environ.get("GEMINI_API_KEY")

        if self.device == "auto":
            self.device = _select_device(self.device)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""LattifAI speaker diarization implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
from tgt import Interval, IntervalTier, TextGrid
|
|
9
|
+
|
|
10
|
+
from lattifai.audio2 import AudioData
|
|
11
|
+
from lattifai.caption import Supervision
|
|
12
|
+
from lattifai.config.diarization import DiarizationConfig
|
|
13
|
+
from lattifai.logging import get_logger
|
|
14
|
+
|
|
15
|
+
# Log record layout: timestamp, level, source file and line, then the message.
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
# Import-time side effect: configures the root logger at INFO level
# (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO, format=formatter)

# Placeholder label; not referenced in the visible part of this module —
# presumably marks speakers that could not be identified (confirm with callers).
NOT_KNOWN = "NotKnown"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LattifAIDiarizer:
    """
    Thin wrapper that delegates speaker diarization to
    ``lattifai_core.diarization.LattifAIDiarizer``.

    The core pipeline is built lazily on first use. (The upstream docstring
    attributes the backend to pyannote.audio; that is not visible from here.)
    """

    def __init__(self, config: Optional[DiarizationConfig] = None):
        """
        Initialize LattifAI diarizer.

        Args:
            config: Diarization configuration; a default ``DiarizationConfig``
                is created when omitted.
        """
        self.config = DiarizationConfig() if config is None else config
        self.logger = get_logger("diarization")

        # Filled in by the ``diarizer`` property on first access.
        self._diarizer = None

    @property
    def name(self) -> str:
        """Human-readable name of the diarizer."""
        return "LattifAI_Diarizer"

    @property
    def diarizer(self):
        """Return the core diarization pipeline, constructing it on first access."""
        if self._diarizer is not None:
            return self._diarizer

        # Imported here so lattifai_core is only loaded once diarization is used.
        from lattifai_core.diarization import LattifAIDiarizer as CoreLattifAIDiarizer

        self._diarizer = CoreLattifAIDiarizer(config=self.config)
        return self._diarizer

    def diarize(
        self,
        input_media: AudioData,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ) -> TextGrid:
        """Run speaker diarization on the input audio via the core pipeline."""
        speaker_options = {
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
        }
        return self.diarizer.diarize(input_media, **speaker_options)

    def diarize_with_alignments(
        self,
        input_media: AudioData,
        alignments: List[Supervision],
        diarization: Optional[TextGrid] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
    ) -> Tuple[TextGrid, List[Supervision]]:
        """Diarize the media and return the result alongside the alignments with refined speaker labels.

        All arguments are forwarded unchanged to the core pipeline's
        ``diarize_with_alignments``.
        """
        forwarded = {
            "alignments": alignments,
            "diarization": diarization,
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
        }
        return self.diarizer.diarize_with_alignments(input_media, **forwarded)
|