lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/errors.py
CHANGED
@@ -1,10 +1,42 @@
 """Error handling and exception classes for LattifAI SDK."""
 
+import functools
 import traceback
 from typing import Any, Dict, Optional
 
 import colorful
 
+
+def format_exception(e: "LattifAIError") -> str:
+    """Format LattifAIError with filtered traceback (only lattifai frames)."""
+    tb_lines = traceback.format_exception(type(e), e, e.__traceback__)
+    filtered = []
+    skip_next_code_line = False
+
+    for i, line in enumerate(tb_lines):
+        if skip_next_code_line:
+            skip_next_code_line = False
+            continue
+
+        if line.startswith("Traceback") or not line.startswith("  File"):
+            filtered.append(line)
+        elif "lattifai" in line:
+            filtered.append(line)
+            if i + 1 < len(tb_lines) and tb_lines[i + 1].startswith(" "):
+                filtered.append(tb_lines[i + 1])
+                skip_next_code_line = True
+        elif i + 1 < len(tb_lines) and tb_lines[i + 1].startswith(" "):
+            skip_next_code_line = True
+
+    return "".join(filtered)
+
+
+def _merge_context(kwargs: Dict[str, Any], updates: Dict[str, Any]) -> None:
+    """Merge updates into kwargs['context'], creating it if needed."""
+    context = kwargs.setdefault("context", {})
+    context.update(updates)
+
+
 # Error help messages
 LATTICE_DECODING_FAILURE_HELP = (
     "Failed to decode lattice alignment. Possible reasons:\n\n"
@@ -76,10 +108,8 @@ class AudioProcessingError(LattifAIError):
     """Error during audio processing operations."""
 
     def __init__(self, message: str, media_path: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
         if media_path:
-            context["media_path"] = media_path
-        kwargs["context"] = context
+            _merge_context(kwargs, {"media_path": media_path})
         super().__init__(message, **kwargs)
 
 
@@ -90,11 +120,9 @@ class AudioLoadError(AudioProcessingError):
         message = f"Failed to load audio file: {colorful.red(media_path)}"
         if original_error:
             message += f" - {colorful.red(str(original_error))}"
-
-        context = kwargs.get("context", {})
-        context.update({"media_path": media_path, "original_error": str(original_error) if original_error else None})
-        kwargs["context"] = context
-
+        _merge_context(
+            kwargs, {"media_path": media_path, "original_error": str(original_error) if original_error else None}
+        )
         super().__init__(message, media_path=media_path, **kwargs)
 
 
@@ -103,9 +131,7 @@ class AudioFormatError(AudioProcessingError):
 
     def __init__(self, media_path: str, format_issue: str, **kwargs):
         message = f"Audio format error for {colorful.red(media_path)}: {colorful.red(format_issue)}"
-        context = kwargs.get("context", {})
-        context.update({"media_path": media_path, "format_issue": format_issue})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"media_path": media_path, "format_issue": format_issue})
         super().__init__(message, media_path=media_path, **kwargs)
 
 
@@ -113,10 +139,8 @@ class CaptionProcessingError(LattifAIError):
     """Error during caption/text processing operations."""
 
    def __init__(self, message: str, caption_path: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
         if caption_path:
-            context["caption_path"] = caption_path
-        kwargs["context"] = context
+            _merge_context(kwargs, {"caption_path": caption_path})
         super().__init__(message, **kwargs)
 
 
@@ -125,9 +149,7 @@ class CaptionParseError(CaptionProcessingError):
 
     def __init__(self, caption_path: str, parse_issue: str, **kwargs):
         message = f"Failed to parse caption file {caption_path}: {parse_issue}"
-        context = kwargs.get("context", {})
-        context.update({"caption_path": caption_path, "parse_issue": parse_issue})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"caption_path": caption_path, "parse_issue": parse_issue})
         super().__init__(message, caption_path=caption_path, **kwargs)
 
 
@@ -135,12 +157,13 @@ class AlignmentError(LattifAIError):
     """Error during audio-text alignment process."""
 
     def __init__(self, message: str, media_path: Optional[str] = None, caption_path: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
+        updates = {}
         if media_path:
-            context["media_path"] = media_path
+            updates["media_path"] = media_path
         if caption_path:
-            context["caption_path"] = caption_path
-        kwargs["context"] = context
+            updates["caption_path"] = caption_path
+        if updates:
+            _merge_context(kwargs, updates)
         super().__init__(message, **kwargs)
 
 
@@ -151,36 +174,44 @@ class LatticeEncodingError(AlignmentError):
         message = "Failed to generate lattice graph from text"
         if original_error:
             message += f": {colorful.red(str(original_error))}"
-
-        context = kwargs.get("context", {})
-        context.update(
+        text_preview = text_content[:100] + "..." if len(text_content) > 100 else text_content
+        _merge_context(
+            kwargs,
             {
                 "text_content_length": len(text_content),
-                "text_preview": text_content[:100] + "..." if len(text_content) > 100 else text_content,
+                "text_preview": text_preview,
                 "original_error": str(original_error) if original_error else None,
-            }
+            },
         )
-        kwargs["context"] = context
         super().__init__(message, **kwargs)
 
 
 class LatticeDecodingError(AlignmentError):
     """Error decoding lattice alignment results."""
 
-    def __init__(self, lattice_id: str, original_error: Optional[Exception] = None, **kwargs):
-        message = f"Failed to decode lattice alignment results for lattice ID: {colorful.red(lattice_id)}"
+    def __init__(
+        self,
+        lattice_id: str,
+        message: Optional[str] = None,
+        original_error: Optional[Exception] = None,
+        skip_help: bool = False,
+        **kwargs,
+    ):
+        message = message or f"Failed to decode lattice alignment results for lattice ID: {colorful.red(lattice_id)}"
 
-        if original_error and str(original_error) != LATTICE_DECODING_FAILURE_HELP:
-            message += f" - {colorful.red(str(original_error))}"
-
+        error_str = str(original_error) if original_error else None
+        is_help_message = error_str == LATTICE_DECODING_FAILURE_HELP
+
+        if original_error and not is_help_message:
+            message += f" - {colorful.red(error_str)}"
+
+        context_updates = {"lattice_id": lattice_id}
+        if original_error and not is_help_message:
+            context_updates["original_error"] = error_str
+        _merge_context(kwargs, context_updates)
 
-        context = kwargs.get("context", {})
-        # Don't store the entire help message in context to avoid duplication
-        if original_error and str(original_error) != LATTICE_DECODING_FAILURE_HELP:
-            context["original_error"] = str(original_error)
-        context["lattice_id"] = lattice_id
-        kwargs["context"] = context
         super().__init__(message, **kwargs)
+        self.skip_help = skip_help
 
     def get_message(self) -> str:
         """Return formatted error message with help text."""
@@ -188,8 +219,9 @@ class LatticeDecodingError(AlignmentError):
         if self.context and self.context.get("lattice_id"):
             # Only show essential context (lattice_id), not the duplicated help message
             base_message += f'\n{colorful.yellow("Lattice ID:")} {self.context["lattice_id"]}'
-        # Append help message
-        base_message += f"\n\n{colorful.yellow(LATTICE_DECODING_FAILURE_HELP)}"
+        # Append help message only if not skipped (e.g., when anomaly info is provided)
+        if not self.skip_help:
+            base_message += f"\n\n{colorful.yellow(LATTICE_DECODING_FAILURE_HELP)}"
         return base_message
 
 
@@ -200,10 +232,9 @@ class ModelLoadError(LattifAIError):
         message = f"Failed to load model: {colorful.red(model_name)}"
         if original_error:
             message += f" - {colorful.red(str(original_error))}"
-
-        context = kwargs.get("context", {})
-        context.update({"model_name": model_name, "original_error": str(original_error) if original_error else None})
-        kwargs["context"] = context
+        _merge_context(
+            kwargs, {"model_name": model_name, "original_error": str(original_error) if original_error else None}
+        )
         super().__init__(message, **kwargs)
 
 
@@ -214,10 +245,7 @@ class DependencyError(LattifAIError):
         message = f"Missing required dependency: {colorful.red(dependency_name)}"
         if install_command:
             message += f"\nPlease install it using: {colorful.yellow(install_command)}"
-
-        context = kwargs.get("context", {})
-        context.update({"dependency_name": dependency_name, "install_command": install_command})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"dependency_name": dependency_name, "install_command": install_command})
         super().__init__(message, **kwargs)
 
 
@@ -225,9 +253,7 @@ class APIError(LattifAIError):
     """Error communicating with LattifAI API."""
 
     def __init__(self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
-        context.update({"status_code": status_code, "response_text": response_text})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"status_code": status_code, "response_text": response_text})
         super().__init__(message, **kwargs)
 
 
@@ -249,14 +275,13 @@ class QuotaExceededError(APIError):
 def handle_exception(func):
     """Decorator to handle exceptions and convert them to LattifAI errors."""
 
+    @functools.wraps(func)
     def wrapper(*args, **kwargs):
        try:
             return func(*args, **kwargs)
         except LattifAIError:
-            # Re-raise LattifAI errors as-is
             raise
         except Exception as e:
-            # Convert other exceptions to LattifAI errors
             error_msg = f"Unexpected error in {func.__name__}: {str(e)}"
             context = {
                 "function": func.__name__,
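The dominant change in errors.py is mechanical: every exception class previously assembled its `context` dict inline, and each call site now delegates to the new `_merge_context` helper. A standalone sketch of the pattern (the helper is copied verbatim from the diff; the `LattifAIError` stub and `DemoError` subclass are illustrative, not the SDK's actual definitions):

from typing import Any, Dict, Optional

def _merge_context(kwargs: Dict[str, Any], updates: Dict[str, Any]) -> None:
    """Merge updates into kwargs['context'], creating it if needed."""
    context = kwargs.setdefault("context", {})
    context.update(updates)

class LattifAIError(Exception):  # minimal stand-in for the SDK base class
    def __init__(self, message: str, context: Optional[Dict[str, Any]] = None):
        super().__init__(message)
        self.context = context or {}

class DemoError(LattifAIError):  # hypothetical subclass using the new pattern
    def __init__(self, message: str, media_path: Optional[str] = None, **kwargs):
        if media_path:
            _merge_context(kwargs, {"media_path": media_path})
        super().__init__(message, **kwargs)

err = DemoError("decode failed", media_path="talk.wav", context={"stage": "alignment"})
assert err.context == {"stage": "alignment", "media_path": "talk.wav"}

Because `setdefault` extends a caller-supplied context in place rather than replacing it, this reproduces exactly what the old `kwargs.get("context", {}) ... kwargs["context"] = context` boilerplate did by hand in each class.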
lattifai/event/__init__.py
ADDED
@@ -0,0 +1,65 @@
+"""Audio Event Detection module for LattifAI.
+
+This module provides audio event detection capabilities, it can identify various
+audio events including speech, music, singing, and demographic characteristics
+(male, female, child voices).
+
+Key Components:
+    LattifAIEventDetector: Main class that wraps lattifai_core's
+    EventDetector for seamless integration with LattifAI workflows.
+
+Features:
+    - Multi-class audio event detection (30+ reduced classes or 400+ full classes)
+    - Voice Activity Detection (VAD) for speech segmentation
+    - Gender/age classification for speech segments
+    - Configurable detection thresholds and top-k filtering
+    - Support for both bundled and custom pretrained models
+
+Detected Event Types:
+    - Speech: General speech activity
+    - Male/Female/Child: Speaker demographic classification
+    - Music: Musical content detection
+    - Singing: Vocal music detection
+    - Synthetic: Synthetic/electronic sounds
+
+Configuration:
+    Use EventConfig to control:
+    - enabled: Whether to run audio event detection
+    - device: GPU/CPU device selection
+    - dtype: Model precision (float32, float16, bfloat16)
+    - reduced: Use reduced label set (33 vs 400+ classes)
+    - top_k: Number of top event classes to detect
+    - vad_chunk_size/vad_max_gap: VAD segmentation parameters
+
+Example:
+    >>> from lattifai.event import LattifAIEventDetector
+    >>> from lattifai.config import EventConfig
+    >>> from lattifai.audio2 import AudioLoader
+    >>>
+    >>> config = EventConfig(enabled=True, device="cuda")
+    >>> detector = LattifAIEventDetector(config)
+    >>>
+    >>> audio = AudioLoader.load("speech.wav")
+    >>> result = detector.detect(audio)
+    >>>
+    >>> # Access VAD segments directly
+    >>> for start, end in result.vad_segments:
+    ...     print(f"Speech: {start:.2f} - {end:.2f}")
+    >>>
+    >>> # Or access the full TextGrid
+    >>> print(result.audio_events)
+
+Performance Notes:
+    - GPU acceleration provides significant speedup (10x+ over CPU)
+    - Use dtype="float16" for faster inference with minimal accuracy loss
+    - fast_mode=True reduces computation by only detecting top_k classes
+    - Long audio files are automatically chunked to manage memory
+
+See Also:
+    - lattifai.config.EventConfig: Configuration options
+    - lattifai_core.event: Core event detection implementation
+"""
+
+from .lattifai import LattifAIEventDetector
+
+__all__ = ["LattifAIEventDetector"]
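EventConfig itself lives in the new lattifai/config/event.py (+102 lines), which this diff view does not expand. Going only by the fields the event code reads, a rough sketch of its shape might look as follows; the field names come from actual usages in the diff, while the types and defaults are guesses:

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

@dataclass
class EventConfig:  # illustrative reconstruction, not the packaged definition
    enabled: bool = False
    device: str = "cpu"
    dtype: str = "float32"            # float32, float16, or bfloat16
    reduced: bool = True              # 33-class reduced label set vs. 400+ full classes
    top_k: int = 5
    vad_chunk_size: Optional[float] = None
    vad_max_gap: Optional[float] = None
    fast_mode: bool = False
    event_matching: bool = False      # match [Event] markers in captions to detections
    event_aliases: Dict[str, str] = field(default_factory=dict)
    extra_events: Optional[List[str]] = None
    time_tolerance: float = 0.5       # guessed default
    update_timestamps: bool = False   # guessed default
    duplicate_strategy: str = "keep"  # guessed default
    model_path: Optional[str] = None  # set by the client mixin from the alignment model path
    client_wrapper: Any = None        # set by the client mixin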
lattifai/event/lattifai.py
ADDED
@@ -0,0 +1,166 @@
+"""LattifAI Audio Event Detection implementation."""
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+from lattifai.audio2 import AudioData
+from lattifai.config.event import EventConfig
+from lattifai.logging import get_logger
+
+if TYPE_CHECKING:
+    from lattifai_core.event import LEDOutput
+
+    from lattifai.data import Caption
+
+
+formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+logging.basicConfig(format=formatter, level=logging.INFO)
+
+
+class LattifAIEventDetector:
+    """
+    LattifAI Audio Event Detector.
+
+    This class provides a high-level interface for audio event detection,
+    wrapping the core LattifAIEventDetector from lattifai_core.
+
+    Attributes:
+        config: EventConfig configuration object.
+
+    Example:
+        >>> from lattifai.event import LattifAIEventDetector
+        >>> from lattifai.config import EventConfig
+        >>>
+        >>> config = EventConfig(enabled=True, device="cuda")
+        >>> detector = LattifAIEventDetector(config)
+        >>>
+        >>> # Detect events from audio data
+        >>> result = detector.detect(audio_data)
+        >>>
+        >>> # Access VAD segments directly
+        >>> for start, end in result.vad_segments:
+        ...     print(f"Speech: {start:.2f} - {end:.2f}")
+        >>>
+        >>> # Or access the full TextGrid
+        >>> for tier in result.audio_events.tiers:
+        ...     print(f"Event type: {tier.name}")
+    """
+
+    def __init__(self, config: EventConfig):
+        """
+        Initialize LattifAI Audio Event Detector.
+
+        Args:
+            config: EventConfig configuration.
+        """
+        self.config = config
+        self.logger = get_logger("event")
+        self._detector = None
+
+    @property
+    def name(self) -> str:
+        """Human-readable name of the detector."""
+        return "LattifAI_EventDetector"
+
+    @property
+    def detector(self):
+        """Lazy-load and return the audio event detector."""
+        if self._detector is None:
+            from lattifai_core.event import LattifAIEventDetector as CoreEventDetector
+
+            self._detector = CoreEventDetector.from_pretrained(
+                model_path=self.config.model_path,
+                device=self.config.device,
+                client_wrapper=self.config.client_wrapper,
+            )
+        return self._detector
+
+    def detect(
+        self,
+        input_media: AudioData,
+        vad_chunk_size: Optional[float] = None,
+        vad_max_gap: Optional[float] = None,
+        fast_mode: Optional[bool] = None,
+    ) -> "LEDOutput":
+        """
+        Detect audio events in the input audio.
+
+        Args:
+            input_media: Audio data to analyze.
+            vad_chunk_size: Override config vad_chunk_size.
+            vad_max_gap: Override config vad_max_gap.
+            fast_mode: Override config fast_mode.
+
+        Returns:
+            LEDOutput containing audio_events, event_names, vad_segments.
+        """
+        return self.detector(
+            audio=input_media,
+            vad_chunk_size=vad_chunk_size or self.config.vad_chunk_size,
+            vad_max_gap=vad_max_gap or self.config.vad_max_gap,
+            fast_mode=fast_mode if fast_mode is not None else self.config.fast_mode,
+            custom_aliases=self.config.event_aliases or {},
+        )
+
+    def profiling(self, reset: bool = False) -> str:
+        """Get profiling information for the detector."""
+        if self._detector is None:
+            return ""
+        return self.detector.profiling(reset=reset, logger=self.logger)
+
+    def detect_and_update_caption(
+        self,
+        caption: "Caption",
+        input_media: AudioData,
+        vad_chunk_size: Optional[float] = None,
+        vad_max_gap: Optional[float] = None,
+        fast_mode: Optional[bool] = None,
+    ) -> "Caption":
+        """
+        Run event detection and update caption with audio events.
+
+        This is the main entry point for integrating event detection with alignment.
+        When event_matching is enabled, it also updates caption timestamps for [Event] markers.
+
+        Args:
+            audio: AudioData to analyze
+            caption: Caption to update with event detection results
+
+        Returns:
+            Updated Caption with event field populated
+        """
+        # Event matching: update caption timestamps based on detected events
+        if self.config.event_matching:
+            # Get supervisions to process
+            supervisions = caption.alignments or caption.supervisions
+
+            led_output, supervisions = self.detector.detect_and_update_supervisions(
+                supervisions=supervisions,
+                audio=input_media,
+                vad_chunk_size=vad_chunk_size or self.config.vad_chunk_size,
+                vad_max_gap=vad_max_gap or self.config.vad_max_gap,
+                fast_mode=fast_mode if fast_mode is not None else self.config.fast_mode,
+                custom_aliases=self.config.event_aliases or {},
+                extra_events=self.config.extra_events or None,
+                time_tolerance=self.config.time_tolerance,
+                update_timestamps=self.config.update_timestamps,
+                duplicate_strategy=self.config.duplicate_strategy,
+            )
+            # Store LEDOutput in caption
+            caption.event = led_output
+
+            if caption.alignments:
+                caption.alignments = supervisions
+            else:
+                caption.supervisions = supervisions
+        else:
+            # Simple detection without event matching
+            led_output = self.detect(
+                input_media=input_media,
+                vad_chunk_size=vad_chunk_size,
+                vad_max_gap=vad_max_gap,
+                fast_mode=fast_mode,
+            )
+            caption.event = led_output
+
+        return caption
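One subtlety in detect() above: the float overrides fall back through `or`, so an explicit 0.0 is indistinguishable from "not passed", whereas fast_mode uses an `is not None` test and therefore honors an explicit False. A standalone illustration of the two fallback styles (the values are arbitrary, not SDK defaults):

config_vad_chunk_size = 30.0
config_fast_mode = True

def resolve(vad_chunk_size=None, fast_mode=None):
    # mirrors the fallback expressions used in LattifAIEventDetector.detect
    chunk = vad_chunk_size or config_vad_chunk_size
    fast = fast_mode if fast_mode is not None else config_fast_mode
    return chunk, fast

assert resolve() == (30.0, True)              # nothing passed: config wins
assert resolve(15.0, False) == (15.0, False)  # explicit overrides win
assert resolve(0.0)[0] == 30.0                # 0.0 is falsy, so `or` falls back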
lattifai/mixin.py
CHANGED
@@ -8,12 +8,12 @@ import colorful
 from lhotse.utils import Pathlike
 
 from lattifai.audio2 import AudioData
-from lattifai.caption import Caption
+from lattifai.data import Caption
 from lattifai.errors import CaptionProcessingError
 from lattifai.utils import safe_print
 
 if TYPE_CHECKING:
-    from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
+    from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
 
 
 class LattifAIClientMixin:
@@ -170,9 +170,10 @@ class LattifAIClientMixin:
         alignment_config: Optional["AlignmentConfig"],
         transcription_config: Optional["TranscriptionConfig"],
         diarization_config: Optional["DiarizationConfig"] = None,
+        event_config: Optional["EventConfig"] = None,
     ) -> tuple:
         """Initialize all configs with defaults if not provided."""
-        from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
+        from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
 
         if alignment_config is None:
             alignment_config = AlignmentConfig()
@@ -180,20 +181,24 @@ class LattifAIClientMixin:
             transcription_config = TranscriptionConfig()
         if diarization_config is None:
             diarization_config = DiarizationConfig()
+        if event_config is None:
+            event_config = EventConfig()
 
         from lattifai.utils import _resolve_model_path
 
         model_path = _resolve_model_path(
             alignment_config.model_name, getattr(alignment_config, "model_hub", "modelscope")
         )
         transcription_config.lattice_model_path = model_path
+        event_config.model_path = model_path
 
         # Set client_wrapper for all configs
         alignment_config.client_wrapper = self
         transcription_config.client_wrapper = self
         diarization_config.client_wrapper = self
+        event_config.client_wrapper = self
 
-        return alignment_config, transcription_config, diarization_config
+        return alignment_config, transcription_config, diarization_config, event_config
 
     def _init_shared_components(
         self,
@@ -220,19 +225,16 @@ class LattifAIClientMixin:
     def downloader(self):
         """Lazy load YouTube downloader."""
         if self._downloader is None:
-            from .workflow.youtube import YouTubeDownloader
+            from .youtube import YouTubeDownloader
 
             self._downloader = YouTubeDownloader()
         return self._downloader
 
     def _prepare_youtube_output_dir(self, output_dir: Optional["Pathlike"]) -> Path:
         """Prepare and return output directory for YouTube downloads."""
-        if output_dir …
-        …
-        output_dir = Path(output_dir).expanduser()
-        output_dir.mkdir(parents=True, exist_ok=True)
-        return output_dir
+        output_path = Path(output_dir).expanduser() if output_dir else Path(tempfile.gettempdir()) / "lattifai_youtube"
+        output_path.mkdir(parents=True, exist_ok=True)
+        return output_path
 
     def _determine_media_format(self, media_format: Optional[str]) -> str:
         """Determine media format from parameter or config."""
@@ -242,11 +244,11 @@ class LattifAIClientMixin:
         self, output_caption_path: Optional["Pathlike"], media_file: str, output_dir: Path
     ) -> Path:
         """Generate output caption path if not provided."""
-        if …
-        …
-        return …
+        if output_caption_path:
+            return Path(output_caption_path)
+        media_name = Path(media_file).stem
+        output_format = self.caption_config.output_format or "srt"
+        return output_dir / f"{media_name}_LattifAI.{output_format}"
 
     def _validate_transcription_setup(self) -> None:
         """Validate that transcription is properly configured if requested."""
@@ -287,18 +289,18 @@ class LattifAIClientMixin:
             format=input_caption_format,
             normalize_text=normalize_text if normalize_text is not None else self.caption_config.normalize_text,
         )
-        diarization_file = Path(str(input_caption)).with_suffix(".…
+        diarization_file = Path(str(input_caption)).with_suffix(".Diarization")
         if diarization_file.exists():
             if verbose:
                 safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
-            caption.…
-
-        if …
+            caption.read_diarization(diarization_file)
+        event_file = Path(str(input_caption)).with_suffix(".LED")
+        if event_file.exists():
             if verbose:
-                safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {…
-            from …
+                safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {event_file}"))
+            from lattifai_core.event import LEDOutput
 
-            caption.…
+            caption.event = LEDOutput.read(event_file)
 
         if verbose:
             safe_print(colorful.green(f"  ✓ Parsed {len(caption)} caption segments"))
@@ -332,11 +334,13 @@ class LattifAIClientMixin:
         result = caption.write(
             output_caption_path,
             include_speaker_in_text=self.caption_config.include_speaker_in_text,
+            word_level=self.caption_config.word_level,
+            karaoke_config=self.caption_config.karaoke,
         )
         diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
-        if not diarization_file.exists() and caption.…
+        if not diarization_file.exists() and caption.diarization:
             safe_print(colorful.green(f"  Writing speaker diarization to: {diarization_file}"))
-            caption.…
+            caption.write_diarization(diarization_file)
 
         safe_print(colorful.green(f"🎉🎉🎉🎉🎉 Caption file written to: {output_caption_path}"))
         return result
@@ -353,14 +357,22 @@ class LattifAIClientMixin:
         output_dir: Path,
         media_format: str,
         force_overwrite: bool,
+        audio_track_id: Optional[str] = "original",
+        quality: str = "best",
     ) -> str:
         """Download media from YouTube (async implementation)."""
         safe_print(colorful.cyan("📥 Downloading media from YouTube..."))
+        if audio_track_id:
+            safe_print(colorful.cyan(f"  Audio track: {audio_track_id}"))
+        if quality != "best":
+            safe_print(colorful.cyan(f"  Quality: {quality}"))
         media_file = await self.downloader.download_media(
             url=url,
             output_dir=str(output_dir),
             media_format=media_format,
             force_overwrite=force_overwrite,
+            audio_track_id=audio_track_id,
+            quality=quality,
         )
         safe_print(colorful.green(f"  ✓ Media downloaded: {media_file}"))
         return media_file
@@ -371,11 +383,15 @@ class LattifAIClientMixin:
         output_dir: Path,
         media_format: str,
         force_overwrite: bool,
+        audio_track_id: Optional[str] = "original",
+        quality: str = "best",
     ) -> str:
         """Download media from YouTube (sync wrapper)."""
         import asyncio
 
-        return asyncio.run(self._download_media(url, output_dir, media_format, force_overwrite))
+        return asyncio.run(
+            self._download_media(url, output_dir, media_format, force_overwrite, audio_track_id, quality)
+        )
 
     def _transcribe(
         self,
@@ -408,7 +424,7 @@ class LattifAIClientMixin:
         # Generate transcript file path
         transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
         if transcript_file.exists():
-            safe_print(colorful.cyan(f"…
+            safe_print(colorful.cyan(f"  Using existing transcript file: {transcript_file}"))
             transcription = self._read_caption(transcript_file, normalize_text=False)
             return transcription
 
@@ -485,11 +501,12 @@ class LattifAIClientMixin:
         """
         import asyncio
 
-        from lattifai.workflow.…
+        from lattifai.workflow.file_manager import TRANSCRIBE_CHOICE
 
         transcriber_name = self.transcriber.name
 
         async def _async_impl():
+            nonlocal use_transcription  # Allow modification of outer variable
             # First check if caption input_path is already provided
             if self.caption_config.input_path:
                 caption_path = Path(self.caption_config.input_path)