lattifai 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +10 -0
- lattifai/alignment/lattice1_aligner.py +64 -15
- lattifai/alignment/lattice1_worker.py +135 -50
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/tokenizer.py +14 -13
- lattifai/audio2.py +269 -70
- lattifai/caption/caption.py +213 -19
- lattifai/cli/__init__.py +2 -0
- lattifai/cli/alignment.py +2 -1
- lattifai/cli/app_installer.py +35 -33
- lattifai/cli/caption.py +9 -19
- lattifai/cli/diarization.py +108 -0
- lattifai/cli/server.py +3 -1
- lattifai/cli/transcribe.py +55 -38
- lattifai/cli/youtube.py +1 -0
- lattifai/client.py +42 -121
- lattifai/config/alignment.py +37 -2
- lattifai/config/caption.py +1 -1
- lattifai/config/media.py +23 -3
- lattifai/config/transcription.py +4 -0
- lattifai/diarization/lattifai.py +18 -7
- lattifai/errors.py +7 -3
- lattifai/mixin.py +45 -16
- lattifai/server/app.py +2 -1
- lattifai/transcription/__init__.py +1 -1
- lattifai/transcription/base.py +21 -2
- lattifai/transcription/gemini.py +127 -1
- lattifai/transcription/lattifai.py +30 -2
- lattifai/utils.py +96 -28
- lattifai/workflow/file_manager.py +15 -13
- lattifai/workflow/youtube.py +16 -1
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/METADATA +86 -22
- lattifai-1.1.0.dist-info/RECORD +57 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt +2 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE +1 -1
- lattifai-1.0.4.dist-info/RECORD +0 -56
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/WHEEL +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/top_level.txt +0 -0
lattifai/cli/diarization.py
ADDED
@@ -0,0 +1,108 @@
+"""Speaker diarization CLI entry point with nemo_run."""
+
+from pathlib import Path
+from typing import Optional
+
+import colorful
+import nemo_run as run
+from typing_extensions import Annotated
+
+from lattifai.client import LattifAI
+from lattifai.config import CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
+from lattifai.utils import safe_print
+
+__all__ = ["diarize"]
+
+
+@run.cli.entrypoint(name="run", namespace="diarization")
+def diarize(
+    input_media: Optional[str] = None,
+    input_caption: Optional[str] = None,
+    output_caption: Optional[str] = None,
+    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
+    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
+    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+):
+    """Run speaker diarization on aligned captions and audio."""
+
+    media_config = media or MediaConfig()
+    caption_config = caption or CaptionConfig()
+    diarization_config = diarization or DiarizationConfig()
+
+    if input_media and media_config.input_path:
+        raise ValueError("Cannot specify both positional input_media and media.input_path.")
+    if input_media:
+        media_config.set_input_path(input_media)
+    if not media_config.input_path:
+        raise ValueError("Input media path must be provided via positional input_media or media.input_path.")
+
+    if input_caption and caption_config.input_path:
+        raise ValueError("Cannot specify both positional input_caption and caption.input_path.")
+    if input_caption:
+        caption_config.set_input_path(input_caption)
+    if not caption_config.input_path:
+        raise ValueError("Input caption path must be provided via positional input_caption or caption.input_path.")
+
+    if output_caption and caption_config.output_path:
+        raise ValueError("Cannot specify both positional output_caption and caption.output_path.")
+    if output_caption:
+        caption_config.set_output_path(output_caption)
+
+    diarization_config.enabled = True
+
+    client_instance = LattifAI(
+        client_config=client,
+        caption_config=caption_config,
+        diarization_config=diarization_config,
+    )
+
+    safe_print(colorful.cyan("🎧 Loading media for diarization..."))
+    media_audio = client_instance.audio_loader(
+        media_config.input_path,
+        channel_selector=media_config.channel_selector,
+        streaming_chunk_secs=media_config.streaming_chunk_secs,
+    )
+
+    safe_print(colorful.cyan("📖 Loading caption segments..."))
+    caption_obj = client_instance._read_caption(
+        caption_config.input_path,
+        input_caption_format=None if caption_config.input_format == "auto" else caption_config.input_format,
+        verbose=False,
+    )
+
+    if not caption_obj.alignments:
+        caption_obj.alignments = caption_obj.supervisions
+
+    if not caption_obj.alignments:
+        raise ValueError("Caption does not contain segments for diarization.")
+
+    if caption_config.output_path:
+        output_path = caption_config.output_path
+    else:
+        from datetime import datetime
+
+        input_caption_path = Path(caption_config.input_path)
+        timestamp = datetime.now().strftime("%Y%m%d_%H")
+        default_output = (
+            input_caption_path.parent / f"{input_caption_path.stem}.diarized.{timestamp}.{caption_config.output_format}"
+        )
+        caption_config.set_output_path(default_output)
+        output_path = caption_config.output_path
+
+    safe_print(colorful.cyan("🗣️ Performing speaker diarization..."))
+    diarized_caption = client_instance.speaker_diarization(
+        input_media=media_audio,
+        caption=caption_obj,
+        output_caption_path=output_path,
+    )
+
+    return diarized_caption
+
+
+def main():
+    run.cli.main(diarize)
+
+
+if __name__ == "__main__":
+    main()
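
The new entrypoint above is a thin wrapper around the LattifAI client. For reference, a minimal sketch (not taken from the package; file paths are placeholders) of driving the same steps programmatically:

    from lattifai.client import LattifAI
    from lattifai.config import CaptionConfig, DiarizationConfig

    diarization_config = DiarizationConfig()
    diarization_config.enabled = True  # the CLI forces this on before building the client

    client = LattifAI(
        caption_config=CaptionConfig(),
        diarization_config=diarization_config,
    )
    media_audio = client.audio_loader("talk.wav", channel_selector="average")
    caption = client._read_caption("talk.srt", None)  # same private helper the CLI calls
    caption = client.speaker_diarization(
        input_media=media_audio,
        caption=caption,
        output_caption_path="talk.diarized.srt",
    )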
lattifai/cli/server.py
CHANGED
@@ -4,6 +4,8 @@ import os
 import colorful
 import uvicorn
 
+from lattifai.utils import safe_print
+
 
 def main():
     """Launch the LattifAI Web Interface."""
@@ -29,7 +31,7 @@ def main():
 
     args = parser.parse_args()
 
-
+    safe_print(colorful.bold_green("🚀 Launching LattifAI Backend Server..."))
     print(colorful.cyan(f"Server running at http://localhost:{args.port}"))
     print(colorful.yellow(f"Host: {args.host}"))
     print(colorful.yellow(f"Auto-reload: {'disabled' if args.no_reload else 'enabled'}"))
lattifai/cli/transcribe.py
CHANGED
@@ -3,10 +3,8 @@
 from typing import Optional
 
 import nemo_run as run
-from lhotse.utils import Pathlike
 from typing_extensions import Annotated
 
-from lattifai.audio2 import AudioLoader, ChannelSelectorType
 from lattifai.cli.alignment import align as alignment_align
 from lattifai.config import (
     AlignmentConfig,
@@ -23,9 +21,8 @@ from lattifai.utils import _resolve_model_path
 def transcribe(
     input: Optional[str] = None,
     output_caption: Optional[str] = None,
-
-
-    channel_selector: Optional[ChannelSelectorType] = "average",
+    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
+    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
 ):
     """
@@ -39,11 +36,8 @@
     Args:
         input: Path to input audio/video file or YouTube URL (can be provided as positional argument)
         output_caption: Path for output caption file (can be provided as positional argument)
-
-
-        channel_selector: Audio channel selection strategy (default: average)
-            Options: average, left, right, or an integer channel index.
-            Note: Ignored when input is a URL and Gemini transcriber is used.
+        media: Media configuration for input/output handling.
+            Fields: input_path, output_dir, media_format, channel_selector, streaming_chunk_secs
         transcription: Transcription service configuration.
            Fields: model_name, device, language, gemini_api_key
 
@@ -67,6 +61,11 @@
         lai transcribe run audio.wav output.srt \\
             transcription.language=zh
 
+        # With MediaConfig settings
+        lai transcribe run audio.wav output.srt \\
+            media.channel_selector=left \\
+            media.streaming_chunk_secs=30.0
+
         # Full configuration with keyword arguments
         lai transcribe run \\
             input=audio.wav \\
@@ -78,68 +77,86 @@
     from pathlib import Path
 
     import colorful
+    from lattifai_core.client import SyncAPIClient
 
+    from lattifai.audio2 import AudioLoader
     from lattifai.transcription import create_transcriber
+    from lattifai.utils import safe_print
 
-    # Initialize
+    # Initialize configs with defaults
+    client_config = client or ClientConfig()
     transcription_config = transcription or TranscriptionConfig()
+    media_config = media or MediaConfig()
+
+    # Initialize client wrapper to properly set client_wrapper
+    client_wrapper = SyncAPIClient(config=client_config)
+    transcription_config.client_wrapper = client_wrapper
+
+    # Initialize client wrapper to properly set client_wrapper
+    client_wrapper = SyncAPIClient(config=client_config)
+    transcription_config.client_wrapper = client_wrapper
 
     # Validate input is required
-    if not input:
-        raise ValueError("Input is required. Provide input as positional argument
+    if not input and not media_config.input_path:
+        raise ValueError("Input is required. Provide input as positional argument or media.input_path.")
+
+    # Assign input to media_config if provided
+    if input:
+        media_config.set_input_path(input)
 
     # Detect if input is a URL
-    is_url =
+    is_url = media_config.is_input_remote()
 
     # Prepare output paths
     if is_url:
-        # For URLs, use output_dir
-
-            output_path = Path(str(output_dir)).expanduser()
-            output_path.mkdir(parents=True, exist_ok=True)
-        else:
-            output_path = Path.cwd()
+        # For URLs, use output_dir from media_config or current directory
+        output_path = media_config.output_dir
     else:
         # For files, use input path directory
-
-        output_path = input_path.parent
+        output_path = Path(media_config.input_path).parent
 
     # Create transcriber
     if not transcription_config.lattice_model_path:
-        transcription_config.lattice_model_path = _resolve_model_path(
+        transcription_config.lattice_model_path = _resolve_model_path(
+            "LattifAI/Lattice-1", getattr(transcription_config, "model_hub", "huggingface")
+        )
     transcriber = create_transcriber(transcription_config=transcription_config)
 
-
-
+    safe_print(colorful.cyan(f"🎤 Starting transcription with {transcriber.name}..."))
+    safe_print(colorful.cyan(f" Input: {media_config.input_path}"))
 
     # Perform transcription
     if is_url and transcriber.supports_url:
         # Check if transcriber supports URL directly
-
-        transcript = asyncio.run(transcriber.transcribe(
+        safe_print(colorful.cyan(" Transcribing from URL directly..."))
+        transcript = asyncio.run(transcriber.transcribe(media_config.input_path))
     else:
         if is_url:
             # Download media first, then transcribe
-
+            safe_print(colorful.cyan(" Downloading media from URL..."))
             from lattifai.workflow.youtube import YouTubeDownloader
 
             downloader = YouTubeDownloader()
             input_path = asyncio.run(
                 downloader.download_media(
-                    url=
+                    url=media_config.input_path,
                     output_dir=str(output_path),
-                    media_format=
-                    force_overwrite=
+                    media_format=media_config.normalize_format(),
+                    force_overwrite=media_config.force_overwrite,
                 )
             )
-
+            safe_print(colorful.cyan(f" Media downloaded to: {input_path}"))
         else:
-            input_path = Path(
+            input_path = Path(media_config.input_path)
 
-
+        safe_print(colorful.cyan(" Loading audio..."))
         # For files, load audio first
         audio_loader = AudioLoader(device=transcription_config.device)
-        media_audio = audio_loader(
+        media_audio = audio_loader(
+            input_path,
+            channel_selector=media_config.channel_selector,
+            streaming_chunk_secs=media_config.streaming_chunk_secs,
+        )
         transcript = asyncio.run(transcriber.transcribe(media_audio))
 
     # Determine output caption path
@@ -153,14 +170,14 @@
         final_output = output_path / f"youtube_LattifAI_{transcriber.name}.{output_format}"
     else:
         # For files, use input filename with suffix
-        final_output = Path(
+        final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
 
-
+    safe_print(colorful.cyan(f" Output: {final_output}"))
 
     # Write output
     transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)
 
-
+    safe_print(colorful.green(f"🎉 Transcription completed: {final_output}"))
 
     return transcript
 
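
The rewrite above moves all input handling onto MediaConfig. A minimal sketch (file path is a placeholder) of the same media path outside the CLI:

    from lattifai.audio2 import AudioLoader
    from lattifai.config import MediaConfig

    media_config = MediaConfig()
    media_config.set_input_path("podcast.mp3")  # positional input is folded in the same way
    if not media_config.is_input_remote():      # URL detection now lives on MediaConfig
        audio_loader = AudioLoader(device="cpu")
        media_audio = audio_loader(
            media_config.input_path,
            channel_selector=media_config.channel_selector,
            streaming_chunk_secs=media_config.streaming_chunk_secs,
        )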
lattifai/cli/youtube.py
CHANGED
lattifai/client.py
CHANGED
@@ -18,6 +18,7 @@ from lattifai.errors import (
     LatticeEncodingError,
 )
 from lattifai.mixin import LattifAIClientMixin
+from lattifai.utils import safe_print
 
 if TYPE_CHECKING:
     from lattifai.diarization import LattifAIDiarizer  # noqa: F401
@@ -91,6 +92,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         input_caption_format: Optional[InputCaptionFormat] = None,
         split_sentence: Optional[bool] = None,
         channel_selector: Optional[str | int] = "average",
+        streaming_chunk_secs: Optional[float] = None,
     ) -> Caption:
         try:
             # Step 1: Get caption
@@ -100,10 +102,17 @@
             media_audio = self.audio_loader(
                 input_media,
                 channel_selector=channel_selector,
+                streaming_chunk_secs=streaming_chunk_secs,
             )
 
             if not input_caption:
-
+                output_dir = None
+                if output_caption_path:
+                    output_dir = Path(str(output_caption_path)).parent
+                    output_dir.mkdir(parents=True, exist_ok=True)
+                caption = self._transcribe(
+                    media_audio, source_lang=self.caption_config.source_lang, is_async=False, output_dir=output_dir
+                )
             else:
                 caption = self._read_caption(input_caption, input_caption_format)
 
@@ -113,7 +122,7 @@
             alignment_strategy = self.aligner.config.strategy
 
             if alignment_strategy != "entire" or caption.transcription:
-
+                safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
                 if caption.supervisions and alignment_strategy == "transcription":
                     # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
@@ -126,7 +135,7 @@
             if not caption.transcription:
                 import asyncio
 
-
+                safe_print(colorful.cyan("📝 Transcribing media for alignment..."))
                 if output_caption_path:
                     transcript_file = (
                         Path(str(output_caption_path)).parent
@@ -223,11 +232,11 @@
                     continue
 
                 offset = round(start, 4)
-
-
-
-
-                )
+                # Extract audio slice
+                audio_slice_ndarray = media_audio.ndarray[
+                    :, int(start * media_audio.sampling_rate) : int(end * media_audio.sampling_rate)
+                ]
+                emission = self.aligner.emission(audio_slice_ndarray)
 
                 # Align segment
                 _supervisions, _alignments = self.aligner.alignment(
@@ -257,18 +266,9 @@
             caption.supervisions = supervisions
             caption.alignments = alignments
 
-
-            if self.diarization_config.enabled and self.diarizer:
-                print(colorful.cyan("🗣️ Performing speaker diarization..."))
-                caption = self.speaker_diarization(
-                    input_media=media_audio,
-                    caption=caption,
-                    output_caption_path=output_caption_path,
-                )
-            elif output_caption_path:
+            if output_caption_path:
                 self._write_caption(caption, output_caption_path)
 
-            return caption
         except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
             # Re-raise our specific errors as-is
             raise
@@ -281,6 +281,17 @@
                 context={"original_error": str(e), "error_type": e.__class__.__name__},
             )
 
+        # Step 5: Speaker diarization
+        if self.diarization_config.enabled and self.diarizer:
+            safe_print(colorful.cyan("🗣️ Performing speaker diarization..."))
+            caption = self.speaker_diarization(
+                input_media=media_audio,
+                caption=caption,
+                output_caption_path=output_caption_path,
+            )
+
+        return caption
+
     def speaker_diarization(
         self,
         input_media: AudioData,
@@ -308,11 +319,18 @@
         if output_caption_path:
             diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
             if diarization_file.exists():
-
+                safe_print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
                 caption.read_speaker_diarization(diarization_file)
 
         diarization, alignments = self.diarizer.diarize_with_alignments(
-            input_media,
+            input_media,
+            caption.alignments,
+            diarization=caption.speaker_diarization,
+            alignment_fn=self.aligner.alignment,
+            transcribe_fn=self.transcriber.transcribe_numpy if self.transcriber else None,
+            separate_fn=self.aligner.separate if self.aligner.worker.separator_ort else None,
+            debug=self.diarizer.config.debug,
+            output_path=output_caption_path,
         )
         caption.alignments = alignments
         caption.speaker_diarization = diarization
@@ -321,105 +339,6 @@
         if output_caption_path:
             self._write_caption(caption, output_caption_path)
 
-        if self.diarizer.config.debug:
-            # debug
-            from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-            debug_tg = TextGrid()
-            transcript_tier = IntervalTier(
-                start_time=0,
-                end_time=input_media.duration,
-                name="transcript",
-                objects=[Interval(sup.start, sup.end, sup.text) for sup in caption.alignments],
-            )
-            debug_tg.add_tier(transcript_tier)
-
-            speaker_tier = IntervalTier(
-                start_time=0,
-                end_time=input_media.duration,
-                name="speaker",
-                objects=[Interval(sup.start, sup.end, sup.speaker) for sup in caption.alignments],
-            )
-            debug_tg.add_tier(speaker_tier)
-
-            from collections import defaultdict
-
-            spk2intervals = defaultdict(lambda: [])
-            num_multispk = 0
-
-            segments, skipks = [], []
-            for k, supervision in enumerate(caption.alignments):  # TODO: alignments 本身存在 overlap, eg: [event]
-                # supervision = caption.alignments[k]
-                if supervision.custom.get("speaker", []):
-                    num_multispk += 1
-                else:
-                    continue
-
-                if k in skipks:
-                    continue
-
-                for speaker in supervision.custom.get("speaker", []):
-                    for name, start_time, end_time in speaker:
-                        spk2intervals[name].append(Interval(start_time, end_time, name))
-
-                _segments = []
-                if k > 0:
-                    _segments.append(caption.alignments[k - 1])
-                _segments.append(supervision)
-                while k + 1 < len(caption.alignments):
-                    skipks.append(k + 1)
-                    next_sup = caption.alignments[k + 1]
-                    if not next_sup.custom.get("speaker", []):
-                        k += 1
-                        break
-                    _segments.append(next_sup)
-                    k += 1
-
-                if segments:
-                    if _segments[0].start >= segments[-1][-1].end:
-                        segments.append(_segments)
-                    else:
-                        if _segments[1:]:
-                            segments.append(_segments[1:])
-                        else:
-                            pass
-                else:
-                    segments.append(_segments)
-
-            print(
-                f"Number of multi-speaker segments: {num_multispk}/{len(caption.alignments)} segments: {len(segments)}"
-            )
-
-            for speaker, intervals in sorted(spk2intervals.items(), key=lambda x: x[0]):
-                speaker_tier = IntervalTier(
-                    start_time=0, end_time=input_media.duration, name=speaker, objects=intervals
-                )
-                debug_tg.add_tier(speaker_tier)
-
-            for tier in caption.speaker_diarization.tiers:
-                tier.name = f"Diarization-{tier.name}"
-                debug_tg.add_tier(tier)
-
-            tier = IntervalTier(
-                start_time=0,
-                end_time=input_media.duration,
-                name="resegment",
-                objects=[
-                    Interval(round(sup.start, 2), round(sup.end, 2), sup.text)
-                    for _segments in segments
-                    for sup in _segments
-                ],
-            )
-            debug_tg.add_tier(tier)
-
-            # if caption.audio_events:
-            #     for tier in caption.audio_events.tiers:
-            #         # tier.name = f"{tier.name}"
-            #         debug_tg.add_tier(tier)
-
-            debug_tgt_file = Path(str(output_caption_path)).with_suffix(".DiarizationDebug.TextGrid")
-            write_to_file(debug_tg, debug_tgt_file, format="long")
-
         return caption
 
     def youtube(
@@ -433,12 +352,13 @@
         split_sentence: Optional[bool] = None,
         use_transcription: bool = False,
         channel_selector: Optional[str | int] = "average",
+        streaming_chunk_secs: Optional[float] = None,
     ) -> Caption:
         # Prepare output directory and media format
         output_dir = self._prepare_youtube_output_dir(output_dir)
         media_format = self._determine_media_format(media_format)
 
-
+        safe_print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))
 
         # Step 1: Download media
         media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
@@ -460,7 +380,7 @@
         output_caption_path = self._generate_output_caption_path(output_caption_path, media_file, output_dir)
 
         # Step 4: Perform alignment
-
+        safe_print(colorful.cyan("🔗 Performing forced alignment..."))
 
         caption: Caption = self.alignment(
             input_media=media_audio,
@@ -468,6 +388,7 @@
             output_caption_path=output_caption_path,
             split_sentence=split_sentence,
             channel_selector=channel_selector,
+            streaming_chunk_secs=streaming_chunk_secs,
         )
 
         return caption
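
In short, 1.1.0 threads streaming_chunk_secs from alignment() and youtube() down to the audio loader and moves diarization into a final step that returns the caption. A minimal sketch of calling the updated API (paths are placeholders; constructing LattifAI with default configs is assumed to work as in the CLI wrappers):

    from lattifai.client import LattifAI

    client = LattifAI()
    caption = client.alignment(
        input_media="episode.wav",
        input_caption="episode.srt",
        output_caption_path="episode.aligned.srt",
        channel_selector="average",
        streaming_chunk_secs=30.0,  # new in 1.1.0, forwarded to the audio loader
    )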
lattifai/config/alignment.py
CHANGED
@@ -18,8 +18,11 @@ class AlignmentConfig:
     """
 
     # Alignment configuration
-    model_name: str = "
-    """Model identifier or path to local model directory (e.g., '
+    model_name: str = "LattifAI/Lattice-1"
+    """Model identifier or path to local model directory (e.g., 'LattifAI/Lattice-1')."""
+
+    model_hub: Literal["huggingface", "modelscope"] = "huggingface"
+    """Which model hub to use when resolving remote model names: 'huggingface' or 'modelscope'."""
 
     device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
     """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
@@ -58,6 +61,38 @@ class AlignmentConfig:
     Default: 4.0 seconds. Useful for detecting scene changes or natural breaks in content.
     """
 
+    # Beam search parameters for forced alignment
+    search_beam: int = 200
+    """Search beam size for beam search decoding. Larger values explore more hypotheses but are slower.
+    Default: 200. Typical range: 20-500.
+    """
+
+    output_beam: int = 80
+    """Output beam size for keeping top hypotheses. Should be smaller than search_beam.
+    Default: 80. Typical range: 10-200.
+    """
+
+    min_active_states: int = 400
+    """Minimum number of active states during decoding. Controls memory and search space.
+    Default: 400. Typical range: 30-1000.
+    """
+
+    max_active_states: int = 10000
+    """Maximum number of active states during decoding. Prevents excessive memory usage.
+    Default: 10000. Typical range: 1000-20000.
+    """
+
+    # Alignment timing configuration
+    start_margin: float = 0.08
+    """Maximum start time margin (in seconds) to extend segment boundaries at the beginning.
+    Default: 0.08. Typical range: 0.0-0.5.
+    """
+
+    end_margin: float = 0.20
+    """Maximum end time margin (in seconds) to extend segment boundaries at the end.
+    Default: 0.20. Typical range: 0.0-0.5.
+    """
+
     client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
     """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
 
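
A minimal sketch of overriding the new decoding and margin fields (values other than the documented defaults are purely illustrative):

    from lattifai.config import AlignmentConfig

    config = AlignmentConfig(
        model_name="LattifAI/Lattice-1",
        model_hub="modelscope",   # resolve the model from ModelScope instead of Hugging Face
        search_beam=100,          # narrower beam: faster, explores fewer hypotheses
        output_beam=40,           # must stay below search_beam
        min_active_states=200,
        max_active_states=5000,
        start_margin=0.05,        # seconds of slack allowed before each segment
        end_margin=0.15,          # seconds of slack allowed after each segment
    )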
lattifai/config/caption.py
CHANGED
@@ -48,7 +48,7 @@ class CaptionConfig:
     include_speaker_in_text: bool = True
     """Preserve speaker labels in caption text content."""
 
-    normalize_text: bool =
+    normalize_text: bool = True
     """Clean HTML entities and normalize whitespace in caption text."""
 
     split_sentence: bool = False