lattifai 1.2.2-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
- lattifai-1.3.1.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
lattifai/data/caption.py
ADDED
@@ -0,0 +1,228 @@
+"""Extended Caption class with transcription, alignment, and diarization support."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar
+
+from lattifai.caption import Caption as BaseCaption
+from lattifai.caption import Pathlike, Supervision
+
+if TYPE_CHECKING:
+    from lattifai_core.event import LEDOutput
+
+DiarizationOutput = TypeVar("DiarizationOutput")
+
+
+@dataclass
+class Caption(BaseCaption):
+    """
+    Extended Caption with transcription, alignment, and diarization support.
+
+    Inherits from BaseCaption and adds fields for:
+    - alignments: Post-alignment results
+    - transcription: ASR results
+    - event: LattifAI Event Detection results (LEDOutput)
+    - diarization: Speaker diarization results
+
+
+    These fields are used in the LattifAI pipeline for:
+    - Forced alignment results
+    - Storing intermediate transcription results
+    - LattifAI Event Detection (music, applause, speech, etc.)
+    - Speaker identification and separation
+
+    """
+
+    # Alignment results
+    alignments: List[Supervision] = field(default_factory=list)
+
+    # Transcription results
+    transcription: List[Supervision] = field(default_factory=list)
+
+    # LattifAI Event Detection results
+    event: Optional["LEDOutput"] = None
+
+    # Speaker Diarization results
+    diarization: Optional[DiarizationOutput] = None
+
+    def __len__(self) -> int:
+        """Return the number of supervision segments."""
+        return len(self.supervisions or self.transcription)
+
+    def __repr__(self) -> str:
+        """String representation of Caption."""
+        lang = f"lang={self.language}" if self.language else "lang=unknown"
+        kind_str = f"kind={self.kind}" if self.kind else ""
+        parts = [f"Caption({len(self.supervisions or self.transcription)} segments", lang]
+        if kind_str:
+            parts.append(kind_str)
+        if self.duration:
+            parts.append(f"duration={self.duration:.2f}s")
+        return ", ".join(parts) + ")"
+
+    def with_margins(
+        self,
+        start_margin: float = 0.08,
+        end_margin: float = 0.20,
+        min_gap: float = 0.08,
+        collision_mode: str = "trim",
+    ) -> "Caption":
+        """
+        Create a new Caption with segment boundaries adjusted based on word-level alignment.
+
+        Uses supervision.alignment['word'] to recalculate segment start/end times
+        with the specified margins applied around the actual speech boundaries.
+
+        Prefers alignments > supervisions > transcription as source.
+
+        Args:
+            start_margin: Seconds to extend before the first word (default: 0.08)
+            end_margin: Seconds to extend after the last word (default: 0.20)
+            min_gap: Minimum gap between segments for collision handling (default: 0.08)
+            collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')
+
+        Returns:
+            New Caption instance with adjusted timestamps
+
+        Note:
+            Segments without alignment data will keep their original timestamps.
+        """
+        from lattifai.caption.standardize import apply_margins_to_captions
+
+        # Determine which supervisions to use (priority: alignments > supervisions > transcription)
+        if self.alignments:
+            source_sups = self.alignments
+        elif self.supervisions:
+            source_sups = self.supervisions
+        else:
+            source_sups = self.transcription
+
+        adjusted_sups = apply_margins_to_captions(
+            source_sups,
+            start_margin=start_margin,
+            end_margin=end_margin,
+            min_gap=min_gap,
+            collision_mode=collision_mode,
+        )
+
+        return Caption(
+            supervisions=adjusted_sups,
+            transcription=self.transcription,
+            event=self.event,
+            diarization=self.diarization,
+            alignments=[],  # Clear alignments since we've applied them
+            language=self.language,
+            kind=self.kind,
+            source_format=self.source_format,
+            source_path=self.source_path,
+            metadata=self.metadata.copy() if self.metadata else {},
+        )
+
+    def write(
+        self,
+        path=None,
+        output_format: Optional[str] = None,
+        include_speaker_in_text: bool = True,
+        word_level: bool = False,
+        karaoke_config=None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Write caption to file or return as bytes.
+
+        Prefers alignments > supervisions > transcription as source.
+
+        Args:
+            path: Path to output caption file, BytesIO object, or None to return bytes
+            output_format: Output format (e.g., 'srt', 'vtt', 'ass')
+            include_speaker_in_text: Whether to include speaker labels in text
+            word_level: Use word-level output format if supported
+            karaoke_config: Karaoke configuration
+            metadata: Optional metadata dict to pass to writer
+
+        Returns:
+            Path to the written file if path is a file path, or bytes if path is BytesIO/None
+        """
+        # Temporarily swap supervisions to use the priority order
+        original_supervisions = self.supervisions
+
+        if self.alignments:
+            self.supervisions = self.alignments
+        elif not self.supervisions and self.transcription:
+            self.supervisions = self.transcription
+
+        try:
+            result = super().write(
+                path=path,
+                output_format=output_format,
+                include_speaker_in_text=include_speaker_in_text,
+                word_level=word_level,
+                karaoke_config=karaoke_config,
+                metadata=metadata,
+            )
+        finally:
+            # Restore original supervisions
+            self.supervisions = original_supervisions
+
+        return result
+
+    @classmethod
+    def from_transcription_results(
+        cls,
+        transcription: List[Supervision],
+        event: Optional["LEDOutput"] = None,
+        diarization: Optional[DiarizationOutput] = None,
+        language: Optional[str] = None,
+        source_path: Optional[Pathlike] = None,
+        metadata: Optional[Dict[str, str]] = None,
+    ) -> "Caption":
+        """
+        Create Caption from transcription results including audio events and diarization.
+
+        Args:
+            transcription: List of transcription supervision segments
+            event: Optional LEDOutput with event detection results
+            diarization: Optional DiarizationOutput with speaker diarization results
+            language: Language code
+            source_path: Source file path
+            metadata: Additional metadata
+
+        Returns:
+            New Caption instance with transcription data
+        """
+        return cls(
+            transcription=transcription,
+            event=event,
+            diarization=diarization,
+            language=language,
+            kind="transcription",
+            source_format="asr",
+            source_path=source_path,
+            metadata=metadata or {},
+        )
+
+    def read_diarization(
+        self,
+        path: Pathlike,
+    ) -> "DiarizationOutput":
+        """
+        Read speaker diarization TextGrid from file.
+        """
+        from lattifai_core.diarization import DiarizationOutput
+
+        self.diarization = DiarizationOutput.read(path)
+        return self.diarization
+
+    def write_diarization(
+        self,
+        path: Pathlike,
+    ) -> Pathlike:
+        """
+        Write speaker diarization TextGrid to file.
+        """
+        if not self.diarization:
+            raise ValueError("No speaker diarization data to write.")
+
+        self.diarization.write(path)
+        return path
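A quick usage sketch of the new class (not part of the diff): the `Supervision` constructor fields below are assumptions based on the base class, and the file names are invented.

# Sketch only: Supervision's fields (start/duration/text) are assumed, not
# confirmed by this diff; adjust to the actual lattifai.caption.Supervision.
from lattifai.caption import Supervision
from lattifai.data.caption import Caption

segments = [Supervision(start=0.0, duration=2.5, text="hello world")]

# kind="transcription" and source_format="asr" are filled in for us.
caption = Caption.from_transcription_results(segments, language="en")
print(len(caption))  # 1, since __len__ falls back to `transcription`

# Snap boundaries to word-level alignments, then export; write() applies the
# same alignments > supervisions > transcription priority internally.
snapped = caption.with_margins(start_margin=0.08, end_margin=0.20)
snapped.write("out.srt", output_format="srt")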
lattifai/errors.py
CHANGED
@@ -1,10 +1,42 @@
 """Error handling and exception classes for LattifAI SDK."""
 
+import functools
 import traceback
 from typing import Any, Dict, Optional
 
 import colorful
 
+
+def format_exception(e: "LattifAIError") -> str:
+    """Format LattifAIError with filtered traceback (only lattifai frames)."""
+    tb_lines = traceback.format_exception(type(e), e, e.__traceback__)
+    filtered = []
+    skip_next_code_line = False
+
+    for i, line in enumerate(tb_lines):
+        if skip_next_code_line:
+            skip_next_code_line = False
+            continue
+
+        if line.startswith("Traceback") or not line.startswith("  File"):
+            filtered.append(line)
+        elif "lattifai" in line:
+            filtered.append(line)
+            if i + 1 < len(tb_lines) and tb_lines[i + 1].startswith("    "):
+                filtered.append(tb_lines[i + 1])
+                skip_next_code_line = True
+        elif i + 1 < len(tb_lines) and tb_lines[i + 1].startswith("    "):
+            skip_next_code_line = True
+
+    return "".join(filtered)
+
+
+def _merge_context(kwargs: Dict[str, Any], updates: Dict[str, Any]) -> None:
+    """Merge updates into kwargs['context'], creating it if needed."""
+    context = kwargs.setdefault("context", {})
+    context.update(updates)
+
+
 # Error help messages
 LATTICE_DECODING_FAILURE_HELP = (
     "Failed to decode lattice alignment. Possible reasons:\n\n"
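Two helpers are introduced here: `format_exception`, which filters a traceback down to frames whose paths mention lattifai, and `_merge_context`, which mutates the caller's `kwargs` in place via `dict.setdefault`, so an existing context dict survives and new keys are layered on top. A self-contained sketch of that merge contract, with made-up values:

# Mirrors the _merge_context helper added above; values are invented.
from typing import Any, Dict

def _merge_context(kwargs: Dict[str, Any], updates: Dict[str, Any]) -> None:
    context = kwargs.setdefault("context", {})
    context.update(updates)

kwargs = {"context": {"media_path": "a.wav"}}  # pre-existing context survives
_merge_context(kwargs, {"status_code": 500})
print(kwargs)  # {'context': {'media_path': 'a.wav', 'status_code': 500}}

kwargs = {}  # no context yet, so setdefault creates one
_merge_context(kwargs, {"lattice_id": "abc"})
print(kwargs)  # {'context': {'lattice_id': 'abc'}}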
@@ -76,10 +108,8 @@ class AudioProcessingError(LattifAIError):
     """Error during audio processing operations."""
 
     def __init__(self, message: str, media_path: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
         if media_path:
-            context["media_path"] = media_path
-        kwargs["context"] = context
+            _merge_context(kwargs, {"media_path": media_path})
         super().__init__(message, **kwargs)
 
 
@@ -90,11 +120,9 @@ class AudioLoadError(AudioProcessingError):
         message = f"Failed to load audio file: {colorful.red(media_path)}"
         if original_error:
             message += f" - {colorful.red(str(original_error))}"
-
-        context = kwargs.get("context", {})
-        context.update({"media_path": media_path, "original_error": str(original_error) if original_error else None})
-        kwargs["context"] = context
-
+        _merge_context(
+            kwargs, {"media_path": media_path, "original_error": str(original_error) if original_error else None}
+        )
         super().__init__(message, media_path=media_path, **kwargs)
 
 
@@ -103,9 +131,7 @@ class AudioFormatError(AudioProcessingError):
 
     def __init__(self, media_path: str, format_issue: str, **kwargs):
         message = f"Audio format error for {colorful.red(media_path)}: {colorful.red(format_issue)}"
-        context = kwargs.get("context", {})
-        context.update({"media_path": media_path, "format_issue": format_issue})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"media_path": media_path, "format_issue": format_issue})
         super().__init__(message, media_path=media_path, **kwargs)
 
 
@@ -113,10 +139,8 @@ class CaptionProcessingError(LattifAIError):
     """Error during caption/text processing operations."""
 
    def __init__(self, message: str, caption_path: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
         if caption_path:
-            context["caption_path"] = caption_path
-        kwargs["context"] = context
+            _merge_context(kwargs, {"caption_path": caption_path})
         super().__init__(message, **kwargs)
 
 
@@ -125,9 +149,7 @@ class CaptionParseError(CaptionProcessingError):
 
     def __init__(self, caption_path: str, parse_issue: str, **kwargs):
         message = f"Failed to parse caption file {caption_path}: {parse_issue}"
-        context = kwargs.get("context", {})
-        context.update({"caption_path": caption_path, "parse_issue": parse_issue})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"caption_path": caption_path, "parse_issue": parse_issue})
         super().__init__(message, caption_path=caption_path, **kwargs)
 
 
@@ -135,12 +157,13 @@ class AlignmentError(LattifAIError):
     """Error during audio-text alignment process."""
 
     def __init__(self, message: str, media_path: Optional[str] = None, caption_path: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
+        updates = {}
         if media_path:
-            context["media_path"] = media_path
+            updates["media_path"] = media_path
         if caption_path:
-            context["caption_path"] = caption_path
-        kwargs["context"] = context
+            updates["caption_path"] = caption_path
+        if updates:
+            _merge_context(kwargs, updates)
         super().__init__(message, **kwargs)
 
 
@@ -151,36 +174,44 @@ class LatticeEncodingError(AlignmentError):
         message = "Failed to generate lattice graph from text"
         if original_error:
             message += f": {colorful.red(str(original_error))}"
-
-        context = kwargs.get("context", {})
-        context.update(
+        text_preview = text_content[:100] + "..." if len(text_content) > 100 else text_content
+        _merge_context(
+            kwargs,
             {
                 "text_content_length": len(text_content),
-                "text_preview": text_content[:100] + "..." if len(text_content) > 100 else text_content,
+                "text_preview": text_preview,
                 "original_error": str(original_error) if original_error else None,
-            }
+            },
         )
-        kwargs["context"] = context
         super().__init__(message, **kwargs)
 
 
 class LatticeDecodingError(AlignmentError):
     """Error decoding lattice alignment results."""
 
-    def __init__(self, lattice_id: str, original_error: Optional[Exception] = None, **kwargs):
-        message = f"Failed to decode lattice alignment results for lattice ID: {colorful.red(lattice_id)}"
+    def __init__(
+        self,
+        lattice_id: str,
+        message: Optional[str] = None,
+        original_error: Optional[Exception] = None,
+        skip_help: bool = False,
+        **kwargs,
+    ):
+        message = message or f"Failed to decode lattice alignment results for lattice ID: {colorful.red(lattice_id)}"
 
-        if original_error:
-            message += f" - {colorful.red(str(original_error))}"
-
+        error_str = str(original_error) if original_error else None
+        is_help_message = error_str == LATTICE_DECODING_FAILURE_HELP
+
+        if original_error and not is_help_message:
+            message += f" - {colorful.red(error_str)}"
+
+        context_updates = {"lattice_id": lattice_id}
+        if original_error and not is_help_message:
+            context_updates["original_error"] = error_str
+        _merge_context(kwargs, context_updates)
 
-        context = kwargs.get("context", {})
-        # Don't store the entire help message in context to avoid duplication
-        if original_error and str(original_error) != LATTICE_DECODING_FAILURE_HELP:
-            context["original_error"] = str(original_error)
-        context["lattice_id"] = lattice_id
-        kwargs["context"] = context
         super().__init__(message, **kwargs)
+        self.skip_help = skip_help
 
     def get_message(self) -> str:
         """Return formatted error message with help text."""
@@ -188,8 +219,9 @@ class LatticeDecodingError(AlignmentError):
         if self.context and self.context.get("lattice_id"):
             # Only show essential context (lattice_id), not the duplicated help message
             base_message += f'\n{colorful.yellow("Lattice ID:")} {self.context["lattice_id"]}'
-        # Append help message
-        base_message += f"\n\n{colorful.yellow(LATTICE_DECODING_FAILURE_HELP)}"
+        # Append help message only if not skipped (e.g., when anomaly info is provided)
+        if not self.skip_help:
+            base_message += f"\n\n{colorful.yellow(LATTICE_DECODING_FAILURE_HELP)}"
         return base_message
 
 
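The reworked `LatticeDecodingError` gains an optional `message` override and a `skip_help` flag that `get_message()` consults before appending the boilerplate help text. A brief sketch (the lattice ID is invented):

from lattifai.errors import LatticeDecodingError

# Default: get_message() appends the LATTICE_DECODING_FAILURE_HELP block.
err = LatticeDecodingError(lattice_id="lat-123")
print(err.get_message())

# skip_help=True suppresses the boilerplate help text, e.g. when anomaly
# details are reported through another channel.
err = LatticeDecodingError(lattice_id="lat-123", skip_help=True)
print(err.get_message())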
@@ -200,10 +232,9 @@ class ModelLoadError(LattifAIError):
         message = f"Failed to load model: {colorful.red(model_name)}"
         if original_error:
             message += f" - {colorful.red(str(original_error))}"
-
-        context = kwargs.get("context", {})
-        context.update({"model_name": model_name, "original_error": str(original_error) if original_error else None})
-        kwargs["context"] = context
+        _merge_context(
+            kwargs, {"model_name": model_name, "original_error": str(original_error) if original_error else None}
+        )
         super().__init__(message, **kwargs)
 
 
@@ -214,10 +245,7 @@ class DependencyError(LattifAIError):
         message = f"Missing required dependency: {colorful.red(dependency_name)}"
         if install_command:
             message += f"\nPlease install it using: {colorful.yellow(install_command)}"
-
-        context = kwargs.get("context", {})
-        context.update({"dependency_name": dependency_name, "install_command": install_command})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"dependency_name": dependency_name, "install_command": install_command})
         super().__init__(message, **kwargs)
 
 
@@ -225,9 +253,7 @@ class APIError(LattifAIError):
     """Error communicating with LattifAI API."""
 
     def __init__(self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None, **kwargs):
-        context = kwargs.get("context", {})
-        context.update({"status_code": status_code, "response_text": response_text})
-        kwargs["context"] = context
+        _merge_context(kwargs, {"status_code": status_code, "response_text": response_text})
         super().__init__(message, **kwargs)
 
 
@@ -249,14 +275,13 @@ class QuotaExceededError(APIError):
 def handle_exception(func):
     """Decorator to handle exceptions and convert them to LattifAI errors."""
 
+    @functools.wraps(func)
     def wrapper(*args, **kwargs):
         try:
             return func(*args, **kwargs)
         except LattifAIError:
-            # Re-raise LattifAI errors as-is
             raise
         except Exception as e:
-            # Convert other exceptions to LattifAI errors
             error_msg = f"Unexpected error in {func.__name__}: {str(e)}"
             context = {
                 "function": func.__name__,
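A minimal sketch of the decorator in use (the failing function is invented, and the exact wrapped exception type is assumed from the hunk above, whose tail is truncated): with `functools.wraps` now applied, the wrapped function keeps its real `__name__`, so the "Unexpected error in ..." message and any introspection report the right function.

from lattifai.errors import LattifAIError, handle_exception

@handle_exception
def flaky_step() -> None:
    """Illustrative function; the failure is invented."""
    raise ValueError("boom")

print(flaky_step.__name__)  # 'flaky_step', preserved by functools.wraps

try:
    flaky_step()
except LattifAIError as e:
    # Assumed behavior: the decorator wraps the ValueError in a LattifAI
    # error carrying "Unexpected error in flaky_step: boom" plus context.
    print(e)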
lattifai/event/__init__.py
ADDED
@@ -0,0 +1,65 @@
+"""Audio Event Detection module for LattifAI.
+
+This module provides audio event detection capabilities; it can identify various
+audio events including speech, music, singing, and demographic characteristics
+(male, female, child voices).
+
+Key Components:
+    LattifAIEventDetector: Main class that wraps lattifai_core's
+    EventDetector for seamless integration with LattifAI workflows.
+
+Features:
+    - Multi-class audio event detection (30+ reduced classes or 400+ full classes)
+    - Voice Activity Detection (VAD) for speech segmentation
+    - Gender/age classification for speech segments
+    - Configurable detection thresholds and top-k filtering
+    - Support for both bundled and custom pretrained models
+
+Detected Event Types:
+    - Speech: General speech activity
+    - Male/Female/Child: Speaker demographic classification
+    - Music: Musical content detection
+    - Singing: Vocal music detection
+    - Synthetic: Synthetic/electronic sounds
+
+Configuration:
+    Use EventConfig to control:
+    - enabled: Whether to run audio event detection
+    - device: GPU/CPU device selection
+    - dtype: Model precision (float32, float16, bfloat16)
+    - reduced: Use reduced label set (33 vs 400+ classes)
+    - top_k: Number of top event classes to detect
+    - vad_chunk_size/vad_max_gap: VAD segmentation parameters
+
+Example:
+    >>> from lattifai.event import LattifAIEventDetector
+    >>> from lattifai.config import EventConfig
+    >>> from lattifai.audio2 import AudioLoader
+    >>>
+    >>> config = EventConfig(enabled=True, device="cuda")
+    >>> detector = LattifAIEventDetector(config)
+    >>>
+    >>> audio = AudioLoader.load("speech.wav")
+    >>> result = detector.detect(audio)
+    >>>
+    >>> # Access VAD segments directly
+    >>> for start, end in result.vad_segments:
+    ...     print(f"Speech: {start:.2f} - {end:.2f}")
+    >>>
+    >>> # Or access the full TextGrid
+    >>> print(result.audio_events)
+
+Performance Notes:
+    - GPU acceleration provides significant speedup (10x+ over CPU)
+    - Use dtype="float16" for faster inference with minimal accuracy loss
+    - fast_mode=True reduces computation by only detecting top_k classes
+    - Long audio files are automatically chunked to manage memory
+
+See Also:
+    - lattifai.config.EventConfig: Configuration options
+    - lattifai_core.event: Core event detection implementation
+"""
+
+from .lattifai import LattifAIEventDetector
+
+__all__ = ["LattifAIEventDetector"]