lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
- lattifai-1.3.0.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/config/__init__.py
CHANGED
@@ -1,19 +1,31 @@
 """Configuration system for LattifAI using nemo_run."""

-from
-from .caption import (
-
+# Re-export caption config classes from lattifai-captions package
+from lattifai.caption.config import (
+    ALL_CAPTION_FORMATS,
+    CAPTION_FORMATS,
+    INPUT_CAPTION_FORMATS,
+    OUTPUT_CAPTION_FORMATS,
     CaptionFonts,
     CaptionStyle,
+    InputCaptionFormat,
     KaraokeConfig,
+    OutputCaptionFormat,
     StandardizationConfig,
 )
+
+from .alignment import AlignmentConfig
+
+# CaptionConfig is defined in lattifai-python (workflow config)
+from .caption import CaptionConfig
 from .client import ClientConfig
 from .diarization import DiarizationConfig
+from .event import EventConfig
 from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
 from .transcription import TranscriptionConfig

 __all__ = [
+    "EventConfig",
     "ClientConfig",
     "AlignmentConfig",
     "CaptionConfig",
@@ -21,6 +33,12 @@ __all__ = [
     "CaptionStyle",
     "KaraokeConfig",
     "StandardizationConfig",
+    "InputCaptionFormat",
+    "OutputCaptionFormat",
+    "INPUT_CAPTION_FORMATS",
+    "OUTPUT_CAPTION_FORMATS",
+    "ALL_CAPTION_FORMATS",
+    "CAPTION_FORMATS",
     "TranscriptionConfig",
     "DiarizationConfig",
     "MediaConfig",
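
Net effect for SDK users: the caption format constants and type aliases now live in the separate lattifai.caption.config module (the lattifai-captions package) but remain importable from lattifai.config. A minimal sketch of the two import paths, using only names that appear in the __all__ list above (not verified against 1.3.0):

```python
# Sketch only — names taken from the __all__ list above.
from lattifai.config import CAPTION_FORMATS, InputCaptionFormat, OutputCaptionFormat
from lattifai.caption.config import CAPTION_FORMATS as UPSTREAM_CAPTION_FORMATS

# A re-export binds the same object, so identity should hold across both paths.
assert CAPTION_FORMATS is UPSTREAM_CAPTION_FORMATS
print(sorted(CAPTION_FORMATS))  # standard formats with both a reader and a writer
```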
lattifai/config/alignment.py
CHANGED
@@ -100,6 +100,13 @@ class AlignmentConfig:
     Default: 5.0. Typical range: 0.0-10.0.
     """

+    transition_penalty: float = 0.0
+    """Penalty for token transitions in the decoding graph to discourage duration=1 tokens.
+    A negative value penalizes transitions (moving to next token), making the model prefer
+    self-loops (staying on current token longer). This helps prevent spurious short-duration alignments.
+    Default: 0.0 (no penalty). Typical range: -1.0 to 0.0 (e.g., -0.5).
+    """
+
     client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
     """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

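
For context on how the new field would be used, a hedged sketch follows; it assumes AlignmentConfig's other fields keep their defaults, which this hunk does not show:

```python
from lattifai.config import AlignmentConfig

# A negative transition_penalty discourages moving to the next token, biasing the
# aligner toward self-loops and away from spurious duration-1 tokens.
config = AlignmentConfig(transition_penalty=-0.5)  # docstring suggests -1.0 .. 0.0
print(config.transition_penalty)
```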
lattifai/config/caption.py
CHANGED
@@ -1,248 +1,18 @@
-"""Caption I/O configuration for LattifAI."""
+"""Caption I/O configuration for LattifAI SDK."""

-from dataclasses import dataclass
+from dataclasses import dataclass
 from pathlib import Path
-from typing import
-
-from
-
-
-
-
-
-
-
-
-
-    These are reference constants for popular fonts. You can use any
-    system font name as the font_name parameter in CaptionStyle.
-    """
-
-    # Western fonts
-    ARIAL = "Arial"
-    IMPACT = "Impact"
-    VERDANA = "Verdana"
-    HELVETICA = "Helvetica"
-
-    # Chinese fonts
-    NOTO_SANS_SC = "Noto Sans SC"
-    MICROSOFT_YAHEI = "Microsoft YaHei"
-    PINGFANG_SC = "PingFang SC"
-    SIMHEI = "SimHei"
-
-    # Japanese fonts
-    NOTO_SANS_JP = "Noto Sans JP"
-    MEIRYO = "Meiryo"
-    HIRAGINO_SANS = "Hiragino Sans"
-
-    # Korean fonts
-    NOTO_SANS_KR = "Noto Sans KR"
-    MALGUN_GOTHIC = "Malgun Gothic"
-
-
-@dataclass
-class CaptionStyle:
-    """Caption style configuration for ASS/TTML formats.
-
-    Attributes:
-        primary_color: Main text color (#RRGGBB)
-        secondary_color: Secondary/highlight color (#RRGGBB)
-        outline_color: Text outline color (#RRGGBB)
-        back_color: Shadow color (#RRGGBB)
-        font_name: Font family name (use CaptionFonts constants or any system font)
-        font_size: Font size in points
-        bold: Enable bold text
-        italic: Enable italic text
-        outline_width: Outline thickness
-        shadow_depth: Shadow distance
-        alignment: ASS alignment (1-9, numpad style), 2=bottom-center
-        margin_l: Left margin in pixels
-        margin_r: Right margin in pixels
-        margin_v: Vertical margin in pixels
-    """
-
-    # Colors (#RRGGBB format)
-    primary_color: str = "#FFFFFF"
-    secondary_color: str = "#00FFFF"
-    outline_color: str = "#000000"
-    back_color: str = "#000000"
-
-    # Font
-    font_name: str = CaptionFonts.ARIAL
-    font_size: int = 48
-    bold: bool = False
-    italic: bool = False
-
-    # Border and shadow
-    outline_width: float = 2.0
-    shadow_depth: float = 1.0
-
-    # Position
-    alignment: int = 2
-    margin_l: int = 20
-    margin_r: int = 20
-    margin_v: int = 20
-
-
-@dataclass
-class KaraokeConfig:
-    """Karaoke export configuration.
-
-    Attributes:
-        enabled: Whether karaoke mode is enabled
-        effect: Karaoke effect type
-            - "sweep": Gradual fill from left to right (ASS \\kf tag)
-            - "instant": Instant highlight (ASS \\k tag)
-            - "outline": Outline then fill (ASS \\ko tag)
-        style: Caption style configuration (font, colors, position)
-        lrc_precision: LRC time precision ("centisecond" or "millisecond")
-        lrc_metadata: LRC metadata dict (ar, ti, al, etc.)
-        ttml_timing_mode: TTML timing attribute ("Word" or "Line")
-    """
-
-    enabled: bool = False
-    effect: Literal["sweep", "instant", "outline"] = "sweep"
-    style: CaptionStyle = field(default_factory=CaptionStyle)
-
-    # LRC specific
-    lrc_precision: Literal["centisecond", "millisecond"] = "millisecond"
-    lrc_metadata: Dict[str, str] = field(default_factory=dict)
-
-    # TTML specific
-    ttml_timing_mode: Literal["Word", "Line"] = "Word"
-
-
-@dataclass
-class StandardizationConfig:
-    """Caption standardization configuration following broadcast guidelines.
-
-    Reference Standards:
-    - Netflix Timed Text Style Guide
-    - BBC Subtitle Guidelines
-    - EBU-TT-D Standard
-
-    Attributes:
-        min_duration: Minimum segment duration (seconds). Netflix recommends 5/6s, BBC 0.3s
-        max_duration: Maximum segment duration (seconds). Netflix/BBC recommends 7s
-        min_gap: Minimum gap between segments (seconds). 80ms prevents subtitle flicker
-        max_lines: Maximum lines per segment. Broadcast standard is typically 2
-        max_chars_per_line: Maximum characters per line. CJK auto-adjusted by ÷2 (e.g., 42 → 21)
-        optimal_cps: Optimal reading speed (chars/sec). Netflix recommends 17-20 CPS
-        start_margin: Start margin (seconds) before first word. None = no adjustment (default)
-        end_margin: End margin (seconds) after last word. None = no adjustment (default)
-        margin_collision_mode: How to handle collisions: 'trim' (reduce margin) or 'gap' (maintain min_gap)
-    """
-
-    min_duration: float = 0.8
-    max_duration: float = 7.0
-    min_gap: float = 0.08
-    max_lines: int = 2
-    max_chars_per_line: int = 42
-    optimal_cps: float = 17.0
-    start_margin: Optional[float] = None
-    end_margin: Optional[float] = None
-    margin_collision_mode: Literal["trim", "gap"] = "trim"
-
-    def __post_init__(self):
-        """Validate configuration parameters."""
-        if self.min_duration <= 0:
-            raise ValueError("min_duration must be positive")
-        if self.max_duration <= self.min_duration:
-            raise ValueError("max_duration must be greater than min_duration")
-        if self.min_gap < 0:
-            raise ValueError("min_gap cannot be negative")
-        if self.max_lines < 1:
-            raise ValueError("max_lines must be at least 1")
-        if self.max_chars_per_line < 10:
-            raise ValueError("max_chars_per_line must be at least 10")
-        if self.start_margin is not None and self.start_margin < 0:
-            raise ValueError("start_margin cannot be negative")
-        if self.end_margin is not None and self.end_margin < 0:
-            raise ValueError("end_margin cannot be negative")
-        if self.margin_collision_mode not in ("trim", "gap"):
-            raise ValueError("margin_collision_mode must be 'trim' or 'gap'")
-
-
-# =============================================================================
-# Format Type Definitions (Single Source of Truth)
-# =============================================================================
-
-# Type alias for input caption formats (all formats with registered readers)
-InputCaptionFormat = Literal[
-    # Standard subtitle formats
-    "srt",
-    "vtt",  # WebVTT (auto-detects YouTube VTT with word-level timestamps)
-    "ass",
-    "ssa",
-    "sub",
-    "sbv",
-    "txt",
-    "sami",
-    "smi",
-    # Tabular formats
-    "csv",
-    "tsv",
-    "aud",
-    "json",
-    # Specialized formats
-    "textgrid",  # Praat TextGrid
-    "gemini",  # Gemini/YouTube transcript format
-    # Professional NLE formats
-    "avid_ds",
-    "fcpxml",
-    "premiere_xml",
-    "audition_csv",
-    # Special
-    "auto",  # Auto-detect format
-]
-
-# Type alias for output caption formats (all formats with registered writers)
-OutputCaptionFormat = Literal[
-    # Standard subtitle formats
-    "srt",
-    "vtt",  # WebVTT (use karaoke_config.enabled=True for YouTube VTT style output)
-    "ass",
-    "ssa",
-    "sub",
-    "sbv",
-    "txt",
-    "sami",
-    "smi",
-    # Tabular formats
-    "csv",
-    "tsv",
-    "aud",
-    "json",
-    # Specialized formats
-    "textgrid",  # Praat TextGrid
-    "gemini",  # Gemini/YouTube transcript format
-    # TTML profiles (write-only)
-    "ttml",  # Generic TTML
-    "imsc1",  # IMSC1 (Netflix/streaming) TTML profile
-    "ebu_tt_d",  # EBU-TT-D (European broadcast) TTML profile
-    # Professional NLE formats
-    "avid_ds",  # Avid Media Composer SubCap format
-    "fcpxml",  # Final Cut Pro XML
-    "premiere_xml",  # Adobe Premiere Pro XML (graphic clips)
-    "audition_csv",  # Adobe Audition markers
-    "edimarker_csv",  # Pro Tools (via EdiMarker) markers
-]
-
-# =============================================================================
-# Runtime Format Lists (Derived from Type Definitions)
-# =============================================================================
-
-# Input caption formats list (derived from InputCaptionFormat)
-INPUT_CAPTION_FORMATS: list[str] = list(get_args(InputCaptionFormat))
-
-# Output caption formats list (derived from OutputCaptionFormat)
-OUTPUT_CAPTION_FORMATS: list[str] = list(get_args(OutputCaptionFormat))
-
-# Standard caption formats (formats with both reader and writer)
-CAPTION_FORMATS: list[str] = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "sami", "smi"]
-
-# All caption formats combined (for file detection, excludes "auto")
-ALL_CAPTION_FORMATS: list[str] = list(set(INPUT_CAPTION_FORMATS + OUTPUT_CAPTION_FORMATS) - {"auto"})
+from typing import Optional
+
+from lattifai.caption.config import (
+    INPUT_CAPTION_FORMATS,
+    OUTPUT_CAPTION_FORMATS,
+    InputCaptionFormat,
+    KaraokeConfig,
+    OutputCaptionFormat,
+    StandardizationConfig,
+)
+from lattifai.caption.supervision import Pathlike


 @dataclass
lattifai/config/client.py
CHANGED
@@ -31,6 +31,13 @@ class ClientConfig:
     When True, prints detailed timing information for various stages of the process.
     """

+    # Client identification for usage tracking
+    client_name: Optional[str] = field(default="python-sdk")
+    """Client identifier for usage tracking (e.g., 'python-sdk', 'claude-plugin')."""
+
+    client_version: Optional[str] = field(default=None)
+    """Client version for usage tracking. If None, uses lattifai package version."""
+
     def __post_init__(self):
         """Validate and auto-populate configuration after initialization."""

@@ -44,6 +51,15 @@ class ClientConfig:
         if self.api_key is None:
             object.__setattr__(self, "api_key", os.environ.get("LATTIFAI_API_KEY"))

+        # Auto-load client version from package if not provided
+        if self.client_version is None:
+            try:
+                from importlib.metadata import version
+
+                object.__setattr__(self, "client_version", version("lattifai"))
+            except Exception:
+                object.__setattr__(self, "client_version", "unknown")
+
         # Validate API parameters
         if self.timeout <= 0:
             raise ValueError("timeout must be greater than 0")
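
Illustration of the new identification fields (a sketch; the other ClientConfig fields and their defaults are not shown in this hunk). Leaving client_version as None lets __post_init__ resolve it from the installed lattifai package metadata, falling back to "unknown":

```python
from lattifai.config import ClientConfig

cfg = ClientConfig(api_key="sk-placeholder", client_name="claude-plugin")
# client_version was not passed, so __post_init__ fills it via importlib.metadata.
print(cfg.client_name, cfg.client_version)
```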
lattifai/config/event.py
ADDED
@@ -0,0 +1,102 @@
+"""Audio Event Detection configuration for LattifAI."""
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional
+
+from ..utils import _select_device
+
+if TYPE_CHECKING:
+    from ..client import SyncAPIClient
+
+
+@dataclass
+class EventConfig:
+    """
+    Audio Event Detection configuration.
+
+    Settings for detecting audio events (Speech, Music, Male, Female...) in audio files using the AED model.
+
+    Event Matching:
+        When event_matching is enabled, the AED system will:
+        1. Parse [Event] markers from input captions (e.g., [Music], [Applause])
+        2. Match caption events to AED labels using semantic matching
+        3. Force detection of matched labels even if not in top_k
+        4. Update caption timestamps based on AED detection results
+
+    Event matching logic is implemented in lattifai_core.event.EventMatcher.
+    """
+
+    enabled: bool = False
+    """Enable audio event detection."""
+
+    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
+    """Computation device for Event Detection models."""
+
+    vad_chunk_size: float = 30.0
+    """VAD chunk size in seconds for speech segmentation."""
+
+    vad_max_gap: float = 2.0
+    """Maximum gap in seconds between VAD segments to merge."""
+
+    fast_mode: bool = True
+    """Enable fast mode (only detect top_k classes, skip others)."""
+
+    model_path: str = ""
+    """Path to pretrained model. If empty, uses default bundled model."""
+
+    event_matching: bool = True
+    """Whether update events in the alignment"""
+
+    extra_events: List[str] = field(default_factory=list)
+    """Additional event types to always detect, even if not in top_k.
+    Example: ["Applause", "Laughter", "Music"]
+    """
+
+    event_aliases: Dict[str, List[str]] = field(default_factory=dict)
+    """Custom aliases mapping [Event] markers to AED labels.
+
+    Core AED labels (14 types):
+        [Applause], [Baby cry], [Battle cry], [Bellow], [Children shouting],
+        [Laughter], [Music], [Shout], [Singing], [Sound effect],
+        [Speech], [Whoop], [Yell]
+
+    Custom aliases extend built-ins (not replace):
+        {"[Audience reaction]": ["[Applause]", "[Cheering]"]}
+    """
+
+    time_tolerance: float = 20.0
+    """Max time (seconds) non-Speech events can extend beyond supervision boundaries."""
+
+    update_timestamps: bool = True
+    """Whether to update caption event timestamps based on AED detections."""
+
+    duplicate_strategy: Literal["keep_all", "merge_first", "split"] = "merge_first"
+    """Strategy for handling multiple [Event] markers mapped to same AED interval.
+    - keep_all: Update all events to same time range (may cause overlapping)
+    - merge_first: Keep only first event per interval, skip duplicates
+    - split: Split interval at speech boundaries (not yet implemented)
+    """
+
+    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
+    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
+
+    def __post_init__(self):
+        """Validate and auto-populate configuration after initialization."""
+        # Validate device
+        if self.device not in ("cpu", "cuda", "mps", "auto") and not self.device.startswith("cuda:"):
+            raise ValueError(f"device must be one of ('cpu', 'cuda', 'mps', 'auto'), got '{self.device}'")
+
+        if self.device == "auto":
+            self.device = _select_device(self.device)
+
+        # Validate vad_chunk_size
+        if self.vad_chunk_size < 0:
+            raise ValueError("vad_chunk_size must be non-negative")
+
+        # Validate vad_max_gap
+        if self.vad_max_gap < 0:
+            raise ValueError("vad_max_gap must be non-negative")
+
+        # Validate time_tolerance
+        if self.time_tolerance < 0:
+            raise ValueError("time_tolerance must be non-negative")
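
A hedged usage sketch of the new config, based only on the fields and docstrings above (EventConfig is re-exported from lattifai.config per the __init__.py hunk earlier):

```python
from lattifai.config import EventConfig

event_cfg = EventConfig(
    enabled=True,
    device="cpu",  # anything but "auto" skips the _select_device probe in __post_init__
    extra_events=["Applause", "Laughter"],  # force-detect even when outside top_k
    event_aliases={"[Audience reaction]": ["[Applause]", "[Cheering]"]},  # extends built-ins
    duplicate_strategy="merge_first",
)
print(event_cfg)
```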
lattifai/config/transcription.py
CHANGED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Literal, Optional
 from ..utils import _select_device

 if TYPE_CHECKING:
-    from ..
+    from ..client import SyncAPIClient

 SUPPORTED_TRANSCRIPTION_MODELS = Literal[
     "gemini-2.5-pro",
@@ -48,6 +48,30 @@ class TranscriptionConfig:
     language: Optional[str] = None
     """Target language code for transcription (e.g., 'en', 'zh', 'ja')."""

+    prompt: Optional[str] = None
+    """Custom prompt text or path to prompt file for transcription.
+    If the value is an existing file path, the file contents will be used.
+    Otherwise, the value is used directly as the prompt text."""
+
+    description: Optional[str] = None
+    """Media description from platforms like YouTube, Xiaoyuzhou (小宇宙), etc.
+    Used to provide context for transcription."""
+
+    thinking: bool = True
+    """Enable Gemini's thinking mode (Gemini models only). Set to False to disable thinking."""
+
+    include_thoughts: bool = False
+    """Include Gemini's thinking process in the output (Gemini models only). Requires thinking=True."""
+
+    temperature: Optional[float] = None
+    """Sampling temperature for generation. Higher values increase randomness."""
+
+    top_k: Optional[float] = None
+    """Top-k sampling parameter. Limits token selection to top k candidates."""
+
+    top_p: Optional[float] = None
+    """Nucleus sampling parameter. Limits token selection by cumulative probability."""
+
     lattice_model_path: Optional[str] = None
     """Path to local LattifAI model. Will be auto-set in LattifAI client."""

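
Finally, a sketch of the new transcription knobs added in this hunk (it assumes the remaining TranscriptionConfig fields keep their defaults, which this diff does not show):

```python
from lattifai.config import TranscriptionConfig

tx_cfg = TranscriptionConfig(
    prompt="Transcribe verbatim and keep filler words.",  # inline text or a path to a prompt file
    description="Episode description pulled from YouTube, used as transcription context.",
    thinking=False,    # disable Gemini thinking mode
    temperature=0.2,   # lower temperature for more deterministic output
)
print(tx_cfg)
```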