lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/config/event.py
ADDED
@@ -0,0 +1,102 @@

```python
"""Audio Event Detection configuration for LattifAI."""

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List, Literal, Optional

from ..utils import _select_device

if TYPE_CHECKING:
    from ..client import SyncAPIClient


@dataclass
class EventConfig:
    """
    Audio Event Detection configuration.

    Settings for detecting audio events (Speech, Music, Male, Female...) in audio files using the AED model.

    Event Matching:
        When event_matching is enabled, the AED system will:
        1. Parse [Event] markers from input captions (e.g., [Music], [Applause])
        2. Match caption events to AED labels using semantic matching
        3. Force detection of matched labels even if not in top_k
        4. Update caption timestamps based on AED detection results

        Event matching logic is implemented in lattifai_core.event.EventMatcher.
    """

    enabled: bool = False
    """Enable audio event detection."""

    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
    """Computation device for Event Detection models."""

    vad_chunk_size: float = 30.0
    """VAD chunk size in seconds for speech segmentation."""

    vad_max_gap: float = 2.0
    """Maximum gap in seconds between VAD segments to merge."""

    fast_mode: bool = True
    """Enable fast mode (only detect top_k classes, skip others)."""

    model_path: str = ""
    """Path to pretrained model. If empty, uses default bundled model."""

    event_matching: bool = True
    """Whether update events in the alignment"""

    extra_events: List[str] = field(default_factory=list)
    """Additional event types to always detect, even if not in top_k.
    Example: ["Applause", "Laughter", "Music"]
    """

    event_aliases: Dict[str, List[str]] = field(default_factory=dict)
    """Custom aliases mapping [Event] markers to AED labels.

    Core AED labels (14 types):
        [Applause], [Baby cry], [Battle cry], [Bellow], [Children shouting],
        [Laughter], [Music], [Shout], [Singing], [Sound effect],
        [Speech], [Whoop], [Yell]

    Custom aliases extend built-ins (not replace):
        {"[Audience reaction]": ["[Applause]", "[Cheering]"]}
    """

    time_tolerance: float = 20.0
    """Max time (seconds) non-Speech events can extend beyond supervision boundaries."""

    update_timestamps: bool = True
    """Whether to update caption event timestamps based on AED detections."""

    duplicate_strategy: Literal["keep_all", "merge_first", "split"] = "merge_first"
    """Strategy for handling multiple [Event] markers mapped to same AED interval.
    - keep_all: Update all events to same time range (may cause overlapping)
    - merge_first: Keep only first event per interval, skip duplicates
    - split: Split interval at speech boundaries (not yet implemented)
    """

    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

    def __post_init__(self):
        """Validate and auto-populate configuration after initialization."""
        # Validate device
        if self.device not in ("cpu", "cuda", "mps", "auto") and not self.device.startswith("cuda:"):
            raise ValueError(f"device must be one of ('cpu', 'cuda', 'mps', 'auto'), got '{self.device}'")

        if self.device == "auto":
            self.device = _select_device(self.device)

        # Validate vad_chunk_size
        if self.vad_chunk_size < 0:
            raise ValueError("vad_chunk_size must be non-negative")

        # Validate vad_max_gap
        if self.vad_max_gap < 0:
            raise ValueError("vad_max_gap must be non-negative")

        # Validate time_tolerance
        if self.time_tolerance < 0:
            raise ValueError("time_tolerance must be non-negative")
```
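For reference, a minimal usage sketch of the new EventConfig. The field names and the `device="auto"` resolution behavior come from the diff above; how the config is wired into the client (e.g., an `event_config` keyword on LattifAI) is not shown in this file, so the sketch stops at constructing the object.

```python
from lattifai.config.event import EventConfig

# Enable AED with a couple of always-detected labels and a custom alias.
# All field names below appear in the diff; the values are illustrative.
config = EventConfig(
    enabled=True,
    extra_events=["Applause", "Laughter"],
    event_aliases={"[Audience reaction]": ["[Applause]", "[Cheering]"]},
    duplicate_strategy="merge_first",
)

print(config.device)  # "auto" is resolved to cpu/cuda/mps by __post_init__
```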
lattifai/config/media.py
CHANGED
```diff
@@ -91,6 +91,26 @@ class MediaConfig:
     force_overwrite: bool = False
     """Overwrite existing output files without prompting."""
 
+    audio_track_id: Optional[str] = "original"
+    """Audio track ID for multi-language YouTube videos.
+    - "original": Select the original audio track (default)
+    - Language code (e.g., "en", "ja", "fr"): Select by language
+    - Format ID (e.g., "251-drc", "140-0"): Select specific format
+    - None: No filtering, use yt-dlp default selection
+    """
+
+    quality: str = "best"
+    """Media quality for YouTube downloads.
+    For audio:
+    - "best": Highest bitrate (default)
+    - "medium": ~128 kbps
+    - "low": ~50 kbps
+    - Numeric string (e.g., "128"): Target bitrate in kbps
+    For video:
+    - "best": Highest resolution (default)
+    - "1080", "720", "480", "360": Target resolution
+    """
+
     def __post_init__(self) -> None:
         """Validate configuration and normalize paths/formats."""
         self._setup_output_directory()
```
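A sketch of how the two new MediaConfig fields might be set when pulling a multi-language YouTube video. The import path follows the file list above, and the sketch assumes the remaining MediaConfig fields have usable defaults; the mapping of these values onto yt-dlp format selection is inferred from the field docstrings.

```python
from lattifai.config.media import MediaConfig

# Ask for the Japanese audio track at roughly 128 kbps instead of the defaults
# ("original" track, "best" quality). Values are illustrative.
media_config = MediaConfig(
    audio_track_id="ja",  # language code; "original", a format ID, or None also accepted
    quality="128",        # numeric string = target audio bitrate in kbps
)
```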
lattifai/config/transcription.py
CHANGED
```diff
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Literal, Optional
 from ..utils import _select_device
 
 if TYPE_CHECKING:
-    from ..
+    from ..client import SyncAPIClient
 
 SUPPORTED_TRANSCRIPTION_MODELS = Literal[
     "gemini-2.5-pro",
@@ -48,6 +48,30 @@ class TranscriptionConfig:
     language: Optional[str] = None
     """Target language code for transcription (e.g., 'en', 'zh', 'ja')."""
 
+    prompt: Optional[str] = None
+    """Custom prompt text or path to prompt file for transcription.
+    If the value is an existing file path, the file contents will be used.
+    Otherwise, the value is used directly as the prompt text."""
+
+    description: Optional[str] = None
+    """Media description from platforms like YouTube, Xiaoyuzhou (小宇宙), etc.
+    Used to provide context for transcription."""
+
+    thinking: bool = True
+    """Enable Gemini's thinking mode (Gemini models only). Set to False to disable thinking."""
+
+    include_thoughts: bool = False
+    """Include Gemini's thinking process in the output (Gemini models only). Requires thinking=True."""
+
+    temperature: Optional[float] = None
+    """Sampling temperature for generation. Higher values increase randomness."""
+
+    top_k: Optional[float] = None
+    """Top-k sampling parameter. Limits token selection to top k candidates."""
+
+    top_p: Optional[float] = None
+    """Nucleus sampling parameter. Limits token selection by cumulative probability."""
+
     lattice_model_path: Optional[str] = None
     """Path to local LattifAI model. Will be auto-set in LattifAI client."""
 
```
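A sketch of the new TranscriptionConfig knobs for the Gemini path. The fields come from the diff; the prompt file path and description text are hypothetical, and whether other constructor arguments are required is not visible in this hunk.

```python
from lattifai.config.transcription import TranscriptionConfig

# Domain prompt loaded from a file, thinking mode on but thoughts excluded from
# the output, and a low temperature for more deterministic transcripts.
transcription_config = TranscriptionConfig(
    language="en",
    prompt="prompts/podcast.txt",  # hypothetical path; plain prompt text also works
    description="Tech podcast, two hosts, occasional Japanese loanwords",
    thinking=True,
    include_thoughts=False,
    temperature=0.2,
)
```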
lattifai/data/caption.py
ADDED
@@ -0,0 +1,228 @@

```python
"""Extended Caption class with transcription, alignment, and diarization support."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar

from lattifai.caption import Caption as BaseCaption
from lattifai.caption import Pathlike, Supervision

if TYPE_CHECKING:
    from lattifai_core.event import LEDOutput

DiarizationOutput = TypeVar("DiarizationOutput")


@dataclass
class Caption(BaseCaption):
    """
    Extended Caption with transcription, alignment, and diarization support.

    Inherits from BaseCaption and adds fields for:
    - alignments: Post-alignment results
    - transcription: ASR results
    - event: LattifAI Event Detection results (LEDOutput)
    - diarization: Speaker diarization results

    These fields are used in the LattifAI pipeline for:
    - Forced alignment results
    - Storing intermediate transcription results
    - LattifAI Event Detection (music, applause, speech, etc.)
    - Speaker identification and separation

    """

    # Alignment results
    alignments: List[Supervision] = field(default_factory=list)

    # Transcription results
    transcription: List[Supervision] = field(default_factory=list)

    # LattifAI Event Detection results
    event: Optional["LEDOutput"] = None

    # Speaker Diarization results
    diarization: Optional[DiarizationOutput] = None

    def __len__(self) -> int:
        """Return the number of supervision segments."""
        return len(self.supervisions or self.transcription)

    def __repr__(self) -> str:
        """String representation of Caption."""
        lang = f"lang={self.language}" if self.language else "lang=unknown"
        kind_str = f"kind={self.kind}" if self.kind else ""
        parts = [f"Caption({len(self.supervisions or self.transcription)} segments", lang]
        if kind_str:
            parts.append(kind_str)
        if self.duration:
            parts.append(f"duration={self.duration:.2f}s")
        return ", ".join(parts) + ")"

    def with_margins(
        self,
        start_margin: float = 0.08,
        end_margin: float = 0.20,
        min_gap: float = 0.08,
        collision_mode: str = "trim",
    ) -> "Caption":
        """
        Create a new Caption with segment boundaries adjusted based on word-level alignment.

        Uses supervision.alignment['word'] to recalculate segment start/end times
        with the specified margins applied around the actual speech boundaries.

        Prefers alignments > supervisions > transcription as source.

        Args:
            start_margin: Seconds to extend before the first word (default: 0.08)
            end_margin: Seconds to extend after the last word (default: 0.20)
            min_gap: Minimum gap between segments for collision handling (default: 0.08)
            collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')

        Returns:
            New Caption instance with adjusted timestamps

        Note:
            Segments without alignment data will keep their original timestamps.
        """
        from lattifai.caption.standardize import apply_margins_to_captions

        # Determine which supervisions to use (priority: alignments > supervisions > transcription)
        if self.alignments:
            source_sups = self.alignments
        elif self.supervisions:
            source_sups = self.supervisions
        else:
            source_sups = self.transcription

        adjusted_sups = apply_margins_to_captions(
            source_sups,
            start_margin=start_margin,
            end_margin=end_margin,
            min_gap=min_gap,
            collision_mode=collision_mode,
        )

        return Caption(
            supervisions=adjusted_sups,
            transcription=self.transcription,
            event=self.event,
            diarization=self.diarization,
            alignments=[],  # Clear alignments since we've applied them
            language=self.language,
            kind=self.kind,
            source_format=self.source_format,
            source_path=self.source_path,
            metadata=self.metadata.copy() if self.metadata else {},
        )

    def write(
        self,
        path=None,
        output_format: Optional[str] = None,
        include_speaker_in_text: bool = True,
        word_level: bool = False,
        karaoke_config=None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Write caption to file or return as bytes.

        Prefers alignments > supervisions > transcription as source.

        Args:
            path: Path to output caption file, BytesIO object, or None to return bytes
            output_format: Output format (e.g., 'srt', 'vtt', 'ass')
            include_speaker_in_text: Whether to include speaker labels in text
            word_level: Use word-level output format if supported
            karaoke_config: Karaoke configuration
            metadata: Optional metadata dict to pass to writer

        Returns:
            Path to the written file if path is a file path, or bytes if path is BytesIO/None
        """
        # Temporarily swap supervisions to use the priority order
        original_supervisions = self.supervisions

        if self.alignments:
            self.supervisions = self.alignments
        elif not self.supervisions and self.transcription:
            self.supervisions = self.transcription

        try:
            result = super().write(
                path=path,
                output_format=output_format,
                include_speaker_in_text=include_speaker_in_text,
                word_level=word_level,
                karaoke_config=karaoke_config,
                metadata=metadata,
            )
        finally:
            # Restore original supervisions
            self.supervisions = original_supervisions

        return result

    @classmethod
    def from_transcription_results(
        cls,
        transcription: List[Supervision],
        event: Optional["LEDOutput"] = None,
        diarization: Optional[DiarizationOutput] = None,
        language: Optional[str] = None,
        source_path: Optional[Pathlike] = None,
        metadata: Optional[Dict[str, str]] = None,
    ) -> "Caption":
        """
        Create Caption from transcription results including audio events and diarization.

        Args:
            transcription: List of transcription supervision segments
            event: Optional LEDOutput with event detection results
            diarization: Optional DiarizationOutput with speaker diarization results
            language: Language code
            source_path: Source file path
            metadata: Additional metadata

        Returns:
            New Caption instance with transcription data
        """
        return cls(
            transcription=transcription,
            event=event,
            diarization=diarization,
            language=language,
            kind="transcription",
            source_format="asr",
            source_path=source_path,
            metadata=metadata or {},
        )

    def read_diarization(
        self,
        path: Pathlike,
    ) -> "DiarizationOutput":
        """
        Read speaker diarization TextGrid from file.
        """
        from lattifai_core.diarization import DiarizationOutput

        self.diarization = DiarizationOutput.read(path)
        return self.diarization

    def write_diarization(
        self,
        path: Pathlike,
    ) -> Pathlike:
        """
        Write speaker diarization TextGrid to file.
        """
        if not self.diarization:
            raise ValueError("No speaker diarization data to write.")

        self.diarization.write(path)
        return path
```
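Putting the extended Caption together with the client: a sketch based on the example in the diarization module docstring below plus the with_margins/write methods added here. That client.alignment() returns this extended Caption class (rather than the base class) is an assumption; file names are illustrative.

```python
from lattifai import LattifAI

client = LattifAI()
caption = client.alignment(audio="speech.wav", input_caption="transcript.srt")

# Snap segment boundaries to the word-level alignment, then export SRT.
# Segments without word alignment keep their original timestamps.
tightened = caption.with_margins(start_margin=0.08, end_margin=0.20, collision_mode="trim")
tightened.write("speech.aligned.srt", output_format="srt")
```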
lattifai/diarization/__init__.py
CHANGED
```diff
@@ -1,4 +1,44 @@
-"""Speaker diarization module for LattifAI."""
+"""Speaker diarization module for LattifAI.
+
+This module provides multi-speaker identification and labeling capabilities
+using pyannote.audio-based diarization models. It can identify who spoke
+when in an audio file and optionally match detected speakers with existing
+speaker labels from input captions.
+
+Key Components:
+    LattifAIDiarizer: Main diarization class that wraps pyannote.audio
+    pipelines for speaker segmentation and clustering.
+
+Features:
+    - Automatic speaker detection with configurable min/max speaker counts
+    - Speaker label preservation from input captions (e.g., "Alice:", ">> Bob:")
+    - Integration with alignment results to assign speakers to words/segments
+    - Support for pre-computed diarization results (avoid reprocessing)
+
+Configuration:
+    Use DiarizationConfig to control:
+    - enabled: Whether to run diarization
+    - min_speakers/max_speakers: Constrain speaker count detection
+    - device: GPU/CPU device selection
+    - debug: Enable verbose output
+
+Example:
+    >>> from lattifai import LattifAI
+    >>> from lattifai.config import DiarizationConfig
+    >>> client = LattifAI(diarization_config=DiarizationConfig(enabled=True))
+    >>> caption = client.alignment(audio="speech.wav", input_caption="transcript.srt")
+    >>> for seg in caption.supervisions:
+    ...     print(f"{seg.speaker}: {seg.text}")
+
+Performance Notes:
+    - Diarization adds ~10-30% processing time to alignment
+    - GPU acceleration recommended for longer audio files
+    - Results are cached when output_path is provided
+
+See Also:
+    - lattifai.config.DiarizationConfig: Configuration options
+    - lattifai.client.LattifAI.speaker_diarization: Direct diarization method
+"""
 
 from .lattifai import LattifAIDiarizer
 
```