lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/audio2.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Audio loading and resampling utilities."""
|
|
2
|
+
|
|
3
|
+
from collections import namedtuple
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import BinaryIO, Iterable, Optional, Tuple, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import soundfile as sf
|
|
9
|
+
import torch
|
|
10
|
+
from lhotse.augmentation import get_or_create_resampler
|
|
11
|
+
from lhotse.utils import Pathlike
|
|
12
|
+
|
|
13
|
+
from lattifai.errors import AudioLoadError
|
|
14
|
+
|
|
15
|
+
# Channel selector: an int channel index or the str "average". Disabled wider form: ChannelSelectorType = Union[int, Iterable[int], str]
|
|
16
|
+
ChannelSelectorType = Union[int, str]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "device", "path"])):
    """Audio data container with sampling rate, numpy array, tensor, device, and source path.

    Fields:
        sampling_rate: Sampling rate used to convert a sample count into seconds.
        ndarray: Array view of the audio; its last axis is treated as the sample axis.
        tensor: Tensor view of the same audio.
        device: Device identifier associated with ``tensor``.
        path: Source path of the audio (or a placeholder for in-memory streams).
    """

    def __str__(self) -> str:
        """Return the source path as a string."""
        # ``__str__`` must return a real ``str``; coerce so that a ``Path`` stored
        # in ``path`` does not make ``str(audio_data)`` raise TypeError.
        return str(self.path)

    @property
    def duration(self) -> float:
        """Duration of the audio in seconds."""
        return self.ndarray.shape[-1] / self.sampling_rate
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AudioLoader:
    """Load and preprocess audio files into AudioData format.

    Decoding is attempted with ``soundfile`` first and falls back to PyAV
    (``av``) when that fails. The decoded waveform is then reduced to the
    requested channel(s) and resampled to the target sampling rate.
    """

    def __init__(
        self,
        device: str = "cpu",
    ):
        """Initialize AudioLoader.

        Args:
            device: Device to load audio tensors on (default: "cpu").
        """
        self.device = device
        # Resampler modules keyed by (source_rate, target_rate, device).
        self._resampler_cache = {}

    def _resample_audio(
        self,
        audio_sr: Tuple[torch.Tensor, int],
        sampling_rate: int,
        device: Optional[str],
        channel_selector: Optional[ChannelSelectorType],
    ) -> torch.Tensor:
        """Resample audio to target sampling rate with channel selection.

        Args:
            audio_sr: Tuple of (audio_tensor, original_sample_rate); the tensor
                is indexed as (channels, samples).
            sampling_rate: Target sampling rate.
            device: Device to perform resampling on.
            channel_selector: ``None`` keeps every channel, an ``int`` picks a
                single channel by index, and ``"average"`` averages all channels.

        Returns:
            Resampled audio tensor of shape (1, T) or (C, T).

        Raises:
            ValueError: If ``channel_selector`` is an out-of-range channel
                index, a string other than ``"average"``, or an unsupported type.
        """
        audio, sr = audio_sr

        if channel_selector is None:
            # keep the original multi-channel signal
            tensor = audio
        elif isinstance(channel_selector, int):
            num_channels = audio.shape[0]
            # Explicit range check (instead of ``assert``): an index equal to the
            # channel count, or a negative one, would otherwise slice out an
            # empty tensor, and asserts disappear under ``python -O``.
            if not 0 <= channel_selector < num_channels:
                raise ValueError(f"Invalid channel: {channel_selector}")
            tensor = audio[channel_selector : channel_selector + 1].clone()
            del audio
        elif isinstance(channel_selector, str):
            if channel_selector != "average":
                raise ValueError(f"Unsupported channel_selector: {channel_selector}")
            tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
            del audio
        else:
            # Selecting an arbitrary subset of channels (Iterable[int]) is not supported.
            raise ValueError(f"Unsupported channel_selector: {channel_selector}")

        tensor = tensor.to(device)
        if sr != sampling_rate:
            cache_key = (sr, sampling_rate, device)
            if cache_key not in self._resampler_cache:
                self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
            resampler = self._resampler_cache[cache_key]

            length = tensor.size(-1)
            # Long inputs are resampled piecewise and concatenated
            # (presumably to bound peak memory; confirm with the author).
            chunk_size = sampling_rate * 3600
            if length > chunk_size:
                resampled_chunks = [
                    resampler(tensor[..., start : start + chunk_size]) for start in range(0, length, chunk_size)
                ]
                tensor = torch.cat(resampled_chunks, dim=-1)
            else:
                tensor = resampler(tensor)

        return tensor

    def _load_audio(
        self,
        audio: Union[Pathlike, BinaryIO],
        sampling_rate: int,
        channel_selector: Optional[ChannelSelectorType],
    ) -> torch.Tensor:
        """Load audio from file or binary stream and resample to target rate.

        Args:
            audio: Path to audio file or binary stream.
            sampling_rate: Target sampling rate.
            channel_selector: How to select channels.

        Returns:
            Resampled audio tensor.

        Raises:
            AudioLoadError: If soundfile cannot read the input and PyAV is not installed.
            RuntimeError: If the PyAV fallback fails, including when no audio
                stream or no audio data is found.
            ValueError: If ``channel_selector`` is invalid.
        """
        if isinstance(audio, (str, Path)):
            audio = str(Path(str(audio)).expanduser())

        # Remember where a stream starts: a failed soundfile attempt may leave it
        # mid-file, and the fallback decoder has to read from the same offset.
        start_offset = None
        if not isinstance(audio, str):
            try:
                start_offset = audio.tell()
            except (AttributeError, OSError, ValueError):
                start_offset = None

        # load audio
        try:
            waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")  # numpy array
            waveform = waveform.T  # (channels, samples)
        except Exception as primary_error:
            # Fallback to PyAV for formats not supported by soundfile
            try:
                import av
            except ImportError:
                raise AudioLoadError(
                    "PyAV (av) is required for loading certain audio formats. "
                    f"Install it with: pip install av\n"
                    f"Primary error was: {primary_error}"
                ) from primary_error

            if start_offset is not None:
                try:
                    audio.seek(start_offset)
                except (AttributeError, OSError, ValueError):
                    # Non-seekable stream: best effort, let the decoder try as-is.
                    pass

            try:
                # The context manager closes the container even when decoding raises.
                with av.open(audio) as container:
                    audio_stream = next((s for s in container.streams if s.type == "audio"), None)

                    if audio_stream is None:
                        raise ValueError(f"No audio stream found in file: {audio}")

                    # Request 32-bit float samples from the decoder (this sets the
                    # sample format only; rate conversion happens later).
                    audio_stream.codec_context.format = av.AudioFormat("flt")

                    frames = []
                    for frame in container.decode(audio_stream):
                        # Convert frame to numpy array
                        array = frame.to_ndarray()
                        # Ensure shape is (channels, samples)
                        # NOTE(review): packed (interleaved) multi-channel frames may
                        # arrive as a single row; confirm against PyAV's to_ndarray docs.
                        if array.ndim == 1:
                            array = array.reshape(1, -1)
                        elif array.ndim == 2 and array.shape[0] > array.shape[1]:
                            array = array.T
                        frames.append(array)

                    sample_rate = audio_stream.codec_context.sample_rate

                if not frames:
                    raise ValueError(f"No audio data found in file: {audio}")

                # Concatenate all frames
                waveform = np.concatenate(frames, axis=1).astype(np.float32)  # (channels, samples)
            except Exception as e:
                raise RuntimeError(f"Failed to load audio file {audio}: {e}") from e

        return self._resample_audio(
            (torch.from_numpy(waveform), sample_rate),
            sampling_rate,
            device=self.device,
            channel_selector=channel_selector,
        )

    def __call__(
        self,
        audio: Union[Pathlike, BinaryIO],
        sampling_rate: int = 16000,
        channel_selector: Optional[ChannelSelectorType] = "average",
    ) -> AudioData:
        """Load ``audio`` and package it as an ``AudioData``.

        Args:
            audio: Path to audio file or binary stream.
            sampling_rate: Target sampling rate (default: 16000).
            channel_selector: How to select channels (default: "average").

        Returns:
            AudioData namedtuple with sampling_rate, ndarray, tensor, device,
            and path fields; ``path`` is "<BinaryIO>" for stream inputs.
        """
        tensor = self._load_audio(audio, sampling_rate, channel_selector)

        # tensor is (1, T) or (C, T)
        ndarray = tensor.cpu().numpy()

        return AudioData(
            sampling_rate=sampling_rate,
            ndarray=ndarray,
            tensor=tensor,
            device=self.device,
            path=str(audio) if isinstance(audio, (str, Path)) else "<BinaryIO>",
        )
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from lhotse.utils import Pathlike
|
|
4
|
+
|
|
5
|
+
from ..config.caption import InputCaptionFormat
|
|
6
|
+
from .caption import Caption
|
|
7
|
+
from .gemini_reader import GeminiReader, GeminiSegment
|
|
8
|
+
from .gemini_writer import GeminiWriter
|
|
9
|
+
from .supervision import Supervision
|
|
10
|
+
from .text_parser import normalize_text
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"Caption",
|
|
14
|
+
"Supervision",
|
|
15
|
+
"GeminiReader",
|
|
16
|
+
"GeminiWriter",
|
|
17
|
+
"GeminiSegment",
|
|
18
|
+
"normalize_text",
|
|
19
|
+
"InputCaptionFormat",
|
|
20
|
+
]
|