lattifai 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/alignment/lattice1_aligner.py +31 -2
- lattifai/alignment/lattice1_worker.py +14 -0
- lattifai/alignment/tokenizer.py +11 -10
- lattifai/caption/caption.py +55 -19
- lattifai/cli/__init__.py +2 -0
- lattifai/cli/caption.py +1 -1
- lattifai/cli/diarization.py +108 -0
- lattifai/cli/transcribe.py +3 -1
- lattifai/client.py +27 -111
- lattifai/config/alignment.py +14 -0
- lattifai/config/transcription.py +4 -0
- lattifai/diarization/lattifai.py +18 -7
- lattifai/mixin.py +19 -1
- lattifai/transcription/__init__.py +1 -1
- lattifai/transcription/base.py +21 -2
- lattifai/transcription/gemini.py +127 -1
- lattifai/transcription/lattifai.py +30 -2
- lattifai/utils.py +59 -28
- {lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/METADATA +24 -10
- {lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/RECORD +24 -23
- {lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt +2 -0
- {lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/WHEEL +0 -0
- {lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/top_level.txt +0 -0
lattifai/diarization/lattifai.py
CHANGED
@@ -1,11 +1,12 @@
 """LattifAI speaker diarization implementation."""
 
 import logging
-from
-from typing import List, Optional, Tuple
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Tuple
 
-import
-from
+import numpy as np
+from lattifai_core.diarization import DiarizationOutput
+from tgt import TextGrid
 
 from lattifai.audio2 import AudioData
 from lattifai.caption import Supervision
@@ -60,7 +61,7 @@ class LattifAIDiarizer:
         num_speakers: Optional[int] = None,
         min_speakers: Optional[int] = None,
         max_speakers: Optional[int] = None,
-    ) ->
+    ) -> DiarizationOutput:
        """Perform speaker diarization on the input audio."""
        return self.diarizer.diarize(
            input_media,
@@ -73,11 +74,16 @@ class LattifAIDiarizer:
         self,
         input_media: AudioData,
         alignments: List[Supervision],
-        diarization: Optional[
+        diarization: Optional[DiarizationOutput] = None,
         num_speakers: Optional[int] = None,
         min_speakers: Optional[int] = None,
         max_speakers: Optional[int] = None,
-
+        alignment_fn: Optional[Callable] = None,
+        transcribe_fn: Optional[Callable] = None,
+        separate_fn: Optional[Callable] = None,
+        debug: bool = False,
+        output_path: Optional[str] = None,
+    ) -> Tuple[DiarizationOutput, List[Supervision]]:
        """Diarize the given media input and return alignments with refined speaker labels."""
        return self.diarizer.diarize_with_alignments(
            input_media,
@@ -86,4 +92,9 @@ class LattifAIDiarizer:
             num_speakers=num_speakers,
             min_speakers=min_speakers,
             max_speakers=max_speakers,
+            alignment_fn=alignment_fn,
+            transcribe_fn=transcribe_fn,
+            separate_fn=separate_fn,
+            debug=debug,
+            output_path=output_path,
         )
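Net effect of these hunks: `diarize` now returns a typed `DiarizationOutput`, and `diarize_with_alignments` can delegate sub-steps through optional callables and emit debug artifacts. A minimal usage sketch, assuming an already constructed `LattifAIDiarizer` plus `media`, `supervisions`, and `transcriber` objects created elsewhere (none of which are shown in this diff):

```python
# Sketch only: `diarizer`, `media`, `supervisions`, and `transcriber` stand in
# for objects constructed outside this diff; argument values are illustrative.
output = diarizer.diarize(media, num_speakers=2)  # now a typed DiarizationOutput

output, refined = diarizer.diarize_with_alignments(
    media,
    alignments=supervisions,
    transcribe_fn=transcriber.transcribe_numpy,  # e.g. the new numpy entry point
    debug=True,
    output_path="diarization_debug",  # hypothetical destination path
)
```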
lattifai/mixin.py
CHANGED
@@ -184,7 +184,9 @@ class LattifAIClientMixin:
         from lattifai.utils import _resolve_model_path
 
         if transcription_config is not None:
-            transcription_config.lattice_model_path = _resolve_model_path(
+            transcription_config.lattice_model_path = _resolve_model_path(
+                alignment_config.model_name, getattr(alignment_config, "model_hub", "huggingface")
+            )
 
         # Set client_wrapper for all configs
         alignment_config.client_wrapper = self
@@ -380,6 +382,7 @@ class LattifAIClientMixin:
         media_file: Union[str, Path, AudioData],
         source_lang: Optional[str],
         is_async: bool = False,
+        output_dir: Optional[Path] = None,
     ) -> Caption:
         """
         Get captions by downloading or transcribing.
@@ -406,6 +409,9 @@ class LattifAIClientMixin:
             safe_print(colorful.green(" ✓ Transcription completed."))
 
             if "gemini" in self.transcriber.name.lower():
+                safe_print(colorful.yellow("🔍 Gemini raw output:"))
+                safe_print(colorful.yellow(f"{transcription[:1000]}..."))  # Print first 1000 chars
+
                 # write to temp file and use Caption read
                 # On Windows, we need to close the file before writing to it
                 tmp_file = tempfile.NamedTemporaryFile(
@@ -428,6 +434,18 @@ class LattifAIClientMixin:
                 # Clean up temp file
                 if tmp_path.exists():
                     tmp_path.unlink()
+            else:
+                safe_print(colorful.yellow(f"🔍 {self.transcriber.name} raw output:"))
+                if isinstance(transcription, Caption):
+                    safe_print(colorful.yellow(f"Caption with {len(transcription.transcription)} segments"))
+                    if transcription.transcription:
+                        safe_print(colorful.yellow(f"First segment: {transcription.transcription[0].text}"))
+
+            if output_dir:
+                # Generate transcript file path
+                transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
+                await asyncio.to_thread(self.transcriber.write, transcription, transcript_file, encoding="utf-8")
+                safe_print(colorful.green(f" ✓ Transcription saved to: {transcript_file}"))
 
             return transcription
 
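For reference, the transcript filename produced by the new `output_dir` branch combines the media stem with the transcriber's reported file name. A runnable illustration with placeholder values (`transcript.txt` stands in for `self.transcriber.file_name`, whose real value is not shown in this diff):

```python
from pathlib import Path

output_dir = Path("out")
media_file = "talks/episode01.mp3"
file_name = "transcript.txt"  # placeholder for self.transcriber.file_name

# Same expression as the diff above: "<media stem>_<transcriber file name>"
transcript_file = output_dir / f"{Path(str(media_file)).stem}_{file_name}"
print(transcript_file)  # out/episode01_transcript.txt
```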
lattifai/transcription/__init__.py
CHANGED
@@ -70,7 +70,7 @@ def create_transcriber(
         raise ValueError(
             f"Cannot determine transcriber for model_name='{transcription_config.model_name}'. "
             f"Supported patterns: \n"
-            f"  - Gemini API models: 'gemini-2.5-pro', 'gemini-3-pro-preview'\n"
+            f"  - Gemini API models: 'gemini-2.5-pro', 'gemini-3-pro-preview', 'gemini-3-flash-preview'\n"
             f"  - Local HF models: 'nvidia/parakeet-*', 'iic/SenseVoiceSmall', etc.\n"
             f"Please specify a valid model_name."
         )
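Dispatch is by model-name pattern, per the error message above: Gemini API names route to the Gemini transcriber, local Hugging Face ids to the local one. An illustrative sketch (the `create_transcriber` import location and the exact `TranscriptionConfig` fields are assumptions):

```python
from lattifai.config import TranscriptionConfig
from lattifai.transcription import create_transcriber  # assumed export location

# Gemini API model names select the Gemini transcriber...
gemini = create_transcriber(TranscriptionConfig(model_name="gemini-3-flash-preview"))
# ...while local Hugging Face model ids select the local transcriber.
local = create_transcriber(TranscriptionConfig(model_name="iic/SenseVoiceSmall"))
```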
lattifai/transcription/base.py
CHANGED
@@ -2,10 +2,12 @@
 
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Optional, Union
+from typing import List, Optional, Union
+
+import numpy as np
 
 from lattifai.audio2 import AudioData
-from lattifai.caption import Caption
+from lattifai.caption import Caption, Supervision
 from lattifai.config import TranscriptionConfig
 from lattifai.logging import get_logger
@@ -96,6 +98,23 @@ class BaseTranscriber(ABC):
             language: Optional language code for transcription.
         """
 
+    @abstractmethod
+    def transcribe_numpy(
+        self,
+        audio: Union[np.ndarray, List[np.ndarray]],
+        language: Optional[str] = None,
+    ) -> Union[Supervision, List[Supervision]]:
+        """
+        Transcribe audio from a numpy array and return Supervision.
+
+        Args:
+            audio_array: Audio data as numpy array (shape: [samples]).
+            language: Optional language code for transcription.
+
+        Returns:
+            Supervision object with transcription info.
+        """
+
     @abstractmethod
     def write(self, transcript: Union[str, Caption], output_file: Path, encoding: str = "utf-8") -> Path:
         """
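Since `transcribe_numpy` is now abstract, every concrete transcriber must implement it. A minimal sketch of a conforming override (hypothetical subclass; the other abstract methods are omitted, and the 16 kHz duration arithmetic mirrors the Gemini implementation below):

```python
import numpy as np

from lattifai.caption import Supervision
from lattifai.transcription.base import BaseTranscriber


class EchoTranscriber(BaseTranscriber):  # hypothetical; other abstract methods omitted
    def transcribe_numpy(self, audio, language=None):
        if isinstance(audio, list):  # batch: one Supervision per array
            return [self.transcribe_numpy(a, language=language) for a in audio]
        return Supervision(
            id="echo-transcription",
            recording_id="numpy-array",
            start=0.0,
            duration=audio.shape[-1] / 16000,  # assumes 16 kHz input
            text="",  # a real implementation would run ASR here
            speaker=None,
            alignment=None,
        )
```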
lattifai/transcription/gemini.py
CHANGED
@@ -2,12 +2,14 @@
 
 import asyncio
 from pathlib import Path
-from typing import Optional, Union
+from typing import List, Optional, Union
 
+import numpy as np
 from google import genai
 from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
 
 from lattifai.audio2 import AudioData
+from lattifai.caption import Supervision
 from lattifai.config import TranscriptionConfig
 from lattifai.transcription.base import BaseTranscriber
 from lattifai.transcription.prompts import get_prompt_loader
@@ -118,6 +120,130 @@ class GeminiTranscriber(BaseTranscriber):
             self.logger.error(f"Gemini transcription failed: {str(e)}")
             raise RuntimeError(f"Gemini transcription failed: {str(e)}")
 
+    def transcribe_numpy(
+        self,
+        audio: Union[np.ndarray, List[np.ndarray]],
+        language: Optional[str] = None,
+    ) -> Union[Supervision, List[Supervision]]:
+        """
+        Transcribe audio from a numpy array (or list of arrays) and return Supervision.
+
+        Note: Gemini API does not support word-level alignment. The returned
+        Supervision will contain only the full transcription text without alignment.
+
+        Args:
+            audio: Audio data as numpy array (shape: [samples]),
+                or a list of such arrays for batch processing.
+            language: Optional language code for transcription.
+
+        Returns:
+            Supervision object (or list of Supervision objects) with transcription text (no alignment).
+
+        Raises:
+            ValueError: If API key not provided
+            RuntimeError: If transcription fails
+        """
+        # Handle batch processing
+        if isinstance(audio, list):
+            return [self.transcribe_numpy(arr, language=language) for arr in audio]
+
+        audio_array = audio
+        # Use default sample rate of 16000 Hz
+        sample_rate = 16000
+
+        if self.config.verbose:
+            self.logger.info(f"🎤 Starting Gemini transcription for numpy array (sample_rate={sample_rate})")
+
+        # Ensure audio is in the correct shape
+        if audio_array.ndim == 1:
+            audio_array = audio_array.reshape(1, -1)
+        elif audio_array.ndim > 2:
+            raise ValueError(f"Audio array must be 1D or 2D, got shape {audio_array.shape}")
+
+        # Save numpy array to temporary file
+        import tempfile
+
+        import soundfile as sf
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            # Transpose to (samples, channels) for soundfile
+            sf.write(tmp_file.name, audio_array.T, sample_rate)
+            tmp_path = Path(tmp_file.name)
+
+        try:
+            # Transcribe using simple ASR prompt
+            import asyncio
+
+            transcript = asyncio.run(self._transcribe_with_simple_prompt(tmp_path, language=language))
+
+            # Create Supervision object from transcript
+            duration = audio_array.shape[-1] / sample_rate
+            supervision = Supervision(
+                id="gemini-transcription",
+                recording_id="numpy-array",
+                start=0.0,
+                duration=duration,
+                text=transcript,
+                speaker=None,
+                alignment=None,  # Gemini does not provide word-level alignment
+            )
+
+            return supervision
+
+        finally:
+            # Clean up temporary file
+            if tmp_path.exists():
+                tmp_path.unlink()
+
+    async def _transcribe_with_simple_prompt(self, media_file: Path, language: Optional[str] = None) -> str:
+        """
+        Transcribe audio using a simple ASR prompt instead of complex instructions.
+
+        Args:
+            media_file: Path to audio file
+            language: Optional language code
+
+        Returns:
+            Transcribed text
+        """
+        client = self._get_client()
+
+        # Upload audio file
+        if self.config.verbose:
+            self.logger.info("📤 Uploading audio file to Gemini...")
+        uploaded_file = client.files.upload(file=str(media_file))
+
+        # Simple ASR prompt
+        system_prompt = "Transcribe the audio."
+        if language:
+            system_prompt = f"Transcribe the audio in {language}."
+
+        # Create simple generation config
+        simple_config = GenerateContentConfig(
+            system_instruction=system_prompt,
+            response_modalities=["TEXT"],
+        )
+
+        contents = Part.from_uri(file_uri=uploaded_file.uri, mime_type=uploaded_file.mime_type)
+        response = await asyncio.get_event_loop().run_in_executor(
+            None,
+            lambda: client.models.generate_content(
+                model=self.config.model_name,
+                contents=contents,
+                config=simple_config,
+            ),
+        )
+
+        if not response.text:
+            raise RuntimeError("Empty response from Gemini API")
+
+        transcript = response.text.strip()
+
+        if self.config.verbose:
+            self.logger.info(f"✅ Transcription completed: {len(transcript)} characters")
+
+        return transcript
+
     def _get_transcription_prompt(self) -> str:
         """Get (and cache) transcription system prompt from prompts module."""
         if self._system_prompt is not None:
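A usage sketch for the new numpy entry point, assuming a configured `GeminiTranscriber` instance named `transcriber` (construction not shown here). Two behaviors worth noting from the code above: the array is written to a temporary 16 kHz WAV because the Gemini Files API consumes files, and the synchronous wrapper calls `asyncio.run`, so it must not be invoked from inside a running event loop:

```python
import numpy as np

# `transcriber` is a configured GeminiTranscriber; construction is omitted here.
audio = np.zeros(16000 * 5, dtype=np.float32)  # stand-in: 5 s of 16 kHz mono audio

sup = transcriber.transcribe_numpy(audio, language="en")
print(sup.text)       # full transcript text
print(sup.alignment)  # None: Gemini provides no word-level alignment
```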
lattifai/transcription/lattifai.py
CHANGED
@@ -1,10 +1,12 @@
 """Transcription module with config-driven architecture."""
 
 from pathlib import Path
-from typing import Optional, Union
+from typing import List, Optional, Union
+
+import numpy as np
 
 from lattifai.audio2 import AudioData
-from lattifai.caption import Caption
+from lattifai.caption import Caption, Supervision
 from lattifai.config import TranscriptionConfig
 from lattifai.transcription.base import BaseTranscriber
 from lattifai.transcription.prompts import get_prompt_loader  # noqa: F401
@@ -74,6 +76,32 @@ class LattifAITranscriber(BaseTranscriber):
 
         return caption
 
+    def transcribe_numpy(
+        self,
+        audio: Union[np.ndarray, List[np.ndarray]],
+        language: Optional[str] = None,
+    ) -> Union[Supervision, List[Supervision]]:
+        """
+        Transcribe audio from a numpy array (or list of arrays) and return Supervision.
+
+        Args:
+            audio: Audio data as numpy array (shape: [samples]),
+                or a list of such arrays for batch processing.
+            language: Optional language code for transcription.
+
+        Returns:
+            Supervision object (or list of Supervision objects) with transcription and alignment info.
+        """
+        if self._transcriber is None:
+            from lattifai_core.transcription import LattifAITranscriber as CoreLattifAITranscriber
+
+            self._transcriber = CoreLattifAITranscriber.from_pretrained(model_config=self.config)
+
+        # Delegate to core transcriber which handles both single arrays and lists
+        return self._transcriber.transcribe(
+            audio, language=language, return_hypotheses=True, progress_bar=False, timestamps=True
+        )[0]
+
     def write(
         self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
     ) -> Path:
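By contrast, the local transcriber instantiates the core model lazily on the first call and delegates with timestamps enabled, so the returned `Supervision` carries alignment info. A sketch, with the surrounding setup again assumed:

```python
import numpy as np

# `transcriber` is a configured LattifAITranscriber; the core model loads
# lazily on the first transcribe_numpy call.
chunk = np.random.randn(16000).astype(np.float32)  # stand-in: 1 s of 16 kHz audio

sup = transcriber.transcribe_numpy(chunk, language="en")
print(sup.text, sup.alignment)  # transcription plus word-level alignment info
```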
lattifai/utils.py
CHANGED
@@ -1,10 +1,9 @@
 """Shared utility helpers for the LattifAI SDK."""
 
-import os
 import sys
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import
+from typing import Optional
 
 from lattifai.errors import ModelLoadError
 
@@ -88,42 +87,74 @@ def _create_cache_marker(cache_dir: Path) -> None:
     marker_path.touch()
 
 
-def _resolve_model_path(model_name_or_path: str) -> str:
-    """Resolve model path, downloading from
+def _resolve_model_path(model_name_or_path: str, model_hub: str = "huggingface") -> str:
+    """Resolve model path, downloading from the specified model hub when necessary.
+
+    Args:
+        model_name_or_path: Local path or remote model identifier.
+        model_hub: Which hub to use for downloads. Supported: "huggingface", "modelscope".
+    """
     if Path(model_name_or_path).expanduser().exists():
         return str(Path(model_name_or_path).expanduser())
 
-
-
-    from huggingface_hub.errors import LocalEntryNotFoundError
+    # Normalize hub name
+    hub = (model_hub or "huggingface").lower()
 
-
-
+    if hub not in ("huggingface", "modelscope"):
+        raise ValueError(f"Unsupported model_hub: {model_hub}. Supported: 'huggingface', 'modelscope'.")
 
-    #
-    if
-
-
-
-
-
-
+    # If local path exists, return it regardless of hub
+    if Path(model_name_or_path).expanduser().exists():
+        return str(Path(model_name_or_path).expanduser())
+
+    if hub == "huggingface":
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.constants import HF_HUB_CACHE
+        from huggingface_hub.errors import LocalEntryNotFoundError
+
+        # Determine cache directory for this model
+        cache_dir = Path(HF_HUB_CACHE) / f'models--{model_name_or_path.replace("/", "--")}'
+
+        # Check if we have a valid cached version
+        if _is_cache_valid(cache_dir):
+            # Return the snapshot path (latest version)
+            snapshots_dir = cache_dir / "snapshots"
+            if snapshots_dir.exists():
+                snapshot_dirs = [d for d in snapshots_dir.iterdir() if d.is_dir()]
+                if snapshot_dirs:
+                    # Return the most recent snapshot
+                    latest_snapshot = max(snapshot_dirs, key=lambda p: p.stat().st_mtime)
+                    return str(latest_snapshot)
 
-    try:
-        downloaded_path = snapshot_download(repo_id=model_name_or_path, repo_type="model")
-        _create_cache_marker(cache_dir)
-        return downloaded_path
-    except LocalEntryNotFoundError:
         try:
-            os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
             downloaded_path = snapshot_download(repo_id=model_name_or_path, repo_type="model")
             _create_cache_marker(cache_dir)
             return downloaded_path
-        except
-
-
+        except LocalEntryNotFoundError:
+            # Fall back to modelscope if HF entry not found
+            try:
+                from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
+
+                downloaded_path = ms_snapshot(model_name_or_path)
+                return downloaded_path
+            except Exception as e:  # pragma: no cover - bubble up for caller context
+                raise ModelLoadError(model_name_or_path, original_error=e)
+        except Exception as e:  # pragma: no cover - unexpected download issue
+            import colorful
+
+            print(colorful.red | f"Error downloading from Hugging Face Hub: {e}. Trying ModelScope...")
+            from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
+
+            downloaded_path = ms_snapshot(model_name_or_path)
+            return downloaded_path
+
+    # modelscope path
+    from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
+
+    try:
+        downloaded_path = ms_snapshot(model_name_or_path)
+        return downloaded_path
+    except Exception as e:  # pragma: no cover
         raise ModelLoadError(model_name_or_path, original_error=e)
 
 
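Version 1.1.0 replaces the old `HF_ENDPOINT=https://hf-mirror.com` retry hack with an explicit `model_hub` argument and a ModelScope fallback. Illustrative calls (the local path is a placeholder; `Lattifai/Lattice-1` is the model referenced in the README):

```python
from lattifai.utils import _resolve_model_path

# An existing local path is returned as-is, whatever the hub.
path = _resolve_model_path("~/models/lattice-1")  # placeholder path

# Default hub is Hugging Face: valid cache snapshot if present, else download,
# with a ModelScope fallback when the HF entry is missing or the download fails.
path = _resolve_model_path("Lattifai/Lattice-1")

# Or target ModelScope directly.
path = _resolve_model_path("Lattifai/Lattice-1", model_hub="modelscope")
```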
{lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lattifai
-Version: 1.0.5
+Version: 1.1.0
 Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
 Author-email: Lattifai Technologies <tech@lattifai.com>
 Maintainer-email: Lattice <tech@lattifai.com>
@@ -50,7 +50,7 @@
 Requires-Python: <3.15,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: lattifai-core>=0.
+Requires-Dist: lattifai-core>=0.6.0
 Requires-Dist: lattifai-run>=1.0.1
 Requires-Dist: python-dotenv
 Requires-Dist: lhotse>=1.26.0
@@ -61,11 +61,12 @@ Requires-Dist: tgt
 Requires-Dist: onnx>=1.16.0
 Requires-Dist: onnxruntime
 Requires-Dist: msgpack
+Requires-Dist: scipy!=1.16.3
 Requires-Dist: g2p-phonemizer>=0.4.0
 Requires-Dist: av
 Requires-Dist: wtpsplit>=2.1.7
 Requires-Dist: OmniSenseVoice>=0.4.2
-Requires-Dist: nemo_toolkit_asr[asr]>=2.7.
+Requires-Dist: nemo_toolkit_asr[asr]>=2.7.0rc4
 Requires-Dist: pyannote-audio-notorchdeps>=4.0.2
 Requires-Dist: questionary>=2.0
 Requires-Dist: yt-dlp
@@ -113,9 +114,6 @@ Dynamic: license-file
 
 Advanced forced alignment and subtitle generation powered by [ 🤗 Lattice-1](https://huggingface.co/Lattifai/Lattice-1) model.
 
-> **⚠️ Note on Current Limitations**:
-> 1. **Memory Usage**: We are aware of high memory consumption and are actively working on further optimizations.
-
 ## Table of Contents
 
 - [Installation](#installation)
@@ -136,7 +134,7 @@
 - [Advanced Features](#advanced-features)
   - [Word-Level Alignment](#word-level-alignment)
   - [Smart Sentence Splitting](#smart-sentence-splitting)
-  - [Speaker Diarization](#speaker-diarization
+  - [Speaker Diarization](#speaker-diarization)
 - [YAML Configuration Files](#yaml-configuration-files)
 - [Supported Formats](#supported-formats)
 - [Roadmap](#roadmap)
@@ -708,9 +706,7 @@ caption = client.alignment(
 )
 ```
 
-### Speaker Diarization
-
-**Note:** This feature is currently under development and not yet fully available.
+### Speaker Diarization
 
 Speaker diarization automatically identifies and labels different speakers in audio. When enabled, the system will:
 - Detect speaker changes in the audio
@@ -721,6 +717,24 @@
 - **Existing speaker labels in subtitles**: If your input captions already contain speaker names (e.g., `[Alice]`, `>> Bob:`, or `SPEAKER_01:`), the system will preserve them as much as possible during alignment
 - **Gemini Transcriber**: When using Gemini models for transcription (e.g., `gemini-2.5-pro`), the model can intelligently identify and extract speaker names from dialogue context, making it easier to generate speaker-aware transcripts
 
+**CLI:**
+```bash
+# Enable speaker diarization during alignment
+lai alignment align audio.wav subtitle.srt output.srt \
+    diarization.enabled=true
+
+# With additional diarization settings
+lai alignment align audio.wav subtitle.srt output.srt \
+    diarization.enabled=true \
+    diarization.device=cuda \
+    diarization.min_speakers=2 \
+    diarization.max_speakers=4
+
+# For YouTube videos with diarization
+lai alignment youtube "https://youtube.com/watch?v=VIDEO_ID" \
+    diarization.enabled=true
+```
+
 **Python SDK:**
 ```python
 from lattifai import LattifAI, DiarizationConfig
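The README's Python SDK block is truncated at its import line in this diff; a minimal sketch consistent with those imports and the CLI options above (constructor and parameter names are assumptions, not the package's documented API):

```python
from lattifai import LattifAI, DiarizationConfig

# Mirrors diarization.enabled / min_speakers / max_speakers from the CLI example.
config = DiarizationConfig(enabled=True, min_speakers=2, max_speakers=4)
client = LattifAI(diarization_config=config)  # keyword name assumed

caption = client.alignment(
    "audio.wav",     # media input
    "subtitle.srt",  # caption to align
    "output.srt",    # aligned output with speaker labels
)
```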
{lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/RECORD
CHANGED
@@ -1,44 +1,45 @@
 lattifai/__init__.py,sha256=7y1R5IGw0Sgvl1tfqxEK7e-ozW0wVB-q_JZgv6YyrMQ,2751
 lattifai/audio2.py,sha256=BKMCzkuEmBFAWOEnzgLxeK8TBPTFbjzr1esOfe3MQoo,17460
-lattifai/client.py,sha256=
+lattifai/client.py,sha256=yD8yIcw3xfvPqBzUQTJj5h9sFYbg4BSHBR5gwO-wGF8,18875
 lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
 lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
-lattifai/mixin.py,sha256=
+lattifai/mixin.py,sha256=t8A3J1DVr7SBEv2FYK3JrGEFqxAV3NktfGPiOQdxTB4,24990
 lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
-lattifai/utils.py,sha256=
+lattifai/utils.py,sha256=N4k2cvoT2CgHsGY0sb-3l-eKUMLzGirPzVa53v8euh4,6658
 lattifai/alignment/__init__.py,sha256=ehpkKfjNIYUx7_M-RWD_8Efcrzd9bE-NSm0QgMMVLW0,178
-lattifai/alignment/lattice1_aligner.py,sha256=
-lattifai/alignment/lattice1_worker.py,sha256=
+lattifai/alignment/lattice1_aligner.py,sha256=5c_bWv0v6jOX8H4gO3-mAKXn4Ux4vjHIUtAY4d8yMak,6110
+lattifai/alignment/lattice1_worker.py,sha256=U_EbDsbisWWKIyfxtx9q4deAyml_hni8DpznZo5zBko,12975
 lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
 lattifai/alignment/segmenter.py,sha256=mzWEQC6hWZtI2mR2WU59W7qLHa7KXy7fdU6991kyUuQ,6276
-lattifai/alignment/tokenizer.py,sha256=
+lattifai/alignment/tokenizer.py,sha256=GNLZbkvZ066PJGUznJVgxZUwSzslD6mz8YsI2Cry6RI,22400
 lattifai/caption/__init__.py,sha256=6MM_2j6CaqwZ81LfSy4di2EP0ykvheRjMZKAYDx2rQs,477
-lattifai/caption/caption.py,sha256=
+lattifai/caption/caption.py,sha256=mZYobxuZ8tkJUkZMVvRTrNeGTdmIZYSXTEySQdaGQd8,54595
 lattifai/caption/gemini_reader.py,sha256=GqY2w78xGYCMDP5kD5WGS8jK0gntel2SK-EPpPKTrwU,15138
 lattifai/caption/gemini_writer.py,sha256=sYPxYEmVQcEan5WVGgSrcraxs3QJRQRh8CJkl2yUQ1s,6515
 lattifai/caption/supervision.py,sha256=DRrM8lfKU_x9aVBcLG6xnT0xIJrnc8jzHpzcSwQOg8c,905
 lattifai/caption/text_parser.py,sha256=XDb8KTt031uJ1hg6dpbINglGOTX-6pBcghbg3DULM1I,4633
-lattifai/cli/__init__.py,sha256=
+lattifai/cli/__init__.py,sha256=LafsAf8YfDcfTeJ1IevFcyLm-mNbxpOOnm33OFKtpDM,523
 lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
 lattifai/cli/app_installer.py,sha256=gAndH3Yo97fGRDe2CQnGtOgZZ4k3_v5ftcUo5g6xbSA,5884
-lattifai/cli/caption.py,sha256=
+lattifai/cli/caption.py,sha256=4qQ9DFhxcfaeFMY0TB5I42x4W_gOo2zY6kjXnHnFDms,6313
+lattifai/cli/diarization.py,sha256=cDz1p6RUC-ySPzzGYHWff0L6EWHTUPrci7DaVxwZrVc,3933
 lattifai/cli/server.py,sha256=sXMfOSse9-V79slXUU8FDLeqtI5U9zeU-5YpjTIGyVw,1186
-lattifai/cli/transcribe.py,sha256=
+lattifai/cli/transcribe.py,sha256=_vHzrdaGiPepQWATqvEDYDjwzfVLAd2i8RjOLkvdb0w,8218
 lattifai/cli/youtube.py,sha256=-EIDSS1Iel3_6qD9M2CZZHwKOvgdkIa1cMY4rX7xwVo,5331
 lattifai/config/__init__.py,sha256=Z8OudvS6fgfLNLu_2fvoXartQiYCECOnNfzDt-PfCN4,543
-lattifai/config/alignment.py,sha256=
+lattifai/config/alignment.py,sha256=vLiH150YWvBUiVkFOIO-nPXCB-b8fP9iSZgS79k1Qbg,4586
 lattifai/config/caption.py,sha256=AYOyUJ1xZsX8CBZy3GpLitbcCAHcZ9LwXui_v3vtuso,6786
 lattifai/config/client.py,sha256=I1JqLQlsQNU5ouovTumr-PP_8GWC9DI_e9B5UwsDZws,1492
 lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
 lattifai/config/media.py,sha256=cjM8eGeZ7ELhmy4cCqHAyogeHItaVqMrPzSwwIx79HY,14856
-lattifai/config/transcription.py,sha256=
+lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
 lattifai/diarization/__init__.py,sha256=MgBDQ1ehL2qDnZprEp8KqON7CmbG-qaP37gzBsV0jzk,119
-lattifai/diarization/lattifai.py,sha256=
+lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
 lattifai/server/app.py,sha256=wXYgXc_yGQACtUJdhkfhLsTOQjhhIhDQRiVRny7Ogcs,15455
-lattifai/transcription/__init__.py,sha256=
-lattifai/transcription/base.py,sha256=
-lattifai/transcription/gemini.py,sha256=
-lattifai/transcription/lattifai.py,sha256=
+lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
+lattifai/transcription/base.py,sha256=v_b1_JGYiBqeMmwns0wHCJ7UOm6j9k-76Uzbr-qmzrs,4467
+lattifai/transcription/gemini.py,sha256=LJSQt9nGqQdEG6ZFXoHWltumyMEM7-Ezy8ss0iPJb7k,12414
+lattifai/transcription/lattifai.py,sha256=EKEdCafgdRWKw_084eD07BqGh2_D-qo3ig3H5X3XYGg,4621
 lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
 lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
 lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
@@ -48,9 +49,9 @@ lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,3
 lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
 lattifai/workflow/file_manager.py,sha256=IUWW838ta83kfwM4gpW83gsD_Tx-pa-L_RWKjiefQbQ,33017
 lattifai/workflow/youtube.py,sha256=ON9z0UUk16ThQzdhdgyOiwBmewZOcxfT05dsl3aKYqw,23840
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
-lattifai-1.0.
+lattifai-1.1.0.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
+lattifai-1.1.0.dist-info/METADATA,sha256=MXg4IXWA38Y9c_RT9S_6NnCZFc_4-Yic_zZDHlS1TeY,26400
+lattifai-1.1.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+lattifai-1.1.0.dist-info/entry_points.txt,sha256=nHZri2VQkPYEl0tQ0dMYTpVGlCOgVWlDG_JtDR3QXF8,545
+lattifai-1.1.0.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+lattifai-1.1.0.dist-info/RECORD,,
{lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt
CHANGED
@@ -1,6 +1,7 @@
 [console_scripts]
 lai-align = lattifai.cli.alignment:main
 lai-app-install = lattifai.cli.app_installer:main
+lai-diarize = lattifai.cli.diarization:main
 lai-server = lattifai.cli.server:main
 lai-transcribe = lattifai.cli.transcribe:main
 lai-youtube = lattifai.cli.youtube:main
@@ -11,4 +12,5 @@ laicap-shift = lattifai.cli.caption:main_shift
 [lai_run.cli]
 alignment = lattifai.cli
 caption = lattifai.cli
+diarization = lattifai.cli
 transcribe = lattifai.cli
{lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/WHEEL
File without changes
{lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE
File without changes
{lattifai-1.0.5.dist-info → lattifai-1.1.0.dist-info}/top_level.txt
File without changes