atom-audio-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asr/__init__.py +45 -0
- asr/base.py +89 -0
- asr/cartesia.py +356 -0
- asr/deepgram.py +196 -0
- atom_audio_engine-0.1.0.dist-info/METADATA +247 -0
- atom_audio_engine-0.1.0.dist-info/RECORD +25 -0
- atom_audio_engine-0.1.0.dist-info/WHEEL +5 -0
- atom_audio_engine-0.1.0.dist-info/top_level.txt +8 -0
- core/__init__.py +13 -0
- core/config.py +162 -0
- core/pipeline.py +282 -0
- core/types.py +87 -0
- integrations/__init__.py +5 -0
- integrations/geneface.py +297 -0
- llm/__init__.py +38 -0
- llm/base.py +108 -0
- llm/groq.py +210 -0
- pipelines/__init__.py +1 -0
- streaming/__init__.py +5 -0
- streaming/websocket_server.py +341 -0
- tts/__init__.py +37 -0
- tts/base.py +155 -0
- tts/cartesia.py +392 -0
- utils/__init__.py +15 -0
- utils/audio.py +220 -0
utils/audio.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Audio utility functions."""
|
|
2
|
+
|
|
3
|
+
import struct
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def resample_audio(
    audio: bytes,
    from_rate: int,
    to_rate: int,
    channels: int = 1,
    sample_width: int = 2,
) -> bytes:
    """
    Resample interleaved PCM audio to a different sample rate.

    Uses scipy's Fourier-domain ``signal.resample`` when numpy and scipy
    are importable; otherwise falls back to pure-Python linear
    interpolation. Each channel is resampled independently so interleaved
    multi-channel audio is not mixed together.

    Args:
        audio: Input audio bytes (signed little-endian PCM, interleaved)
        from_rate: Original sample rate
        to_rate: Target sample rate
        channels: Number of audio channels
        sample_width: Bytes per sample (2 for 16-bit, 4 for 32-bit)

    Returns:
        Resampled audio bytes. The input is returned unchanged when the
        two rates are equal. A trailing incomplete frame is dropped.

    Raises:
        ValueError: If a rate or the channel count is not positive, or the
            sample width is unsupported.
    """
    if from_rate == to_rate:
        return audio

    if from_rate <= 0 or to_rate <= 0:
        raise ValueError(
            f"Sample rates must be positive: {from_rate} -> {to_rate}"
        )
    if channels < 1:
        raise ValueError(f"Unsupported channel count: {channels}")
    if sample_width not in (2, 4):
        # Same message as the fallback so both code paths agree.
        raise ValueError(f"Unsupported sample width: {sample_width}")

    try:
        import numpy as np
        from scipy import signal
    except ImportError:
        # Fallback to simple linear interpolation
        return _simple_resample(
            audio, from_rate, to_rate, sample_width, channels
        )

    # Explicit little-endian dtype keeps this path consistent with the
    # struct-based fallback regardless of host byte order.
    dtype = np.dtype("<i2") if sample_width == 2 else np.dtype("<i4")

    # Only whole frames can be reshaped into (frames, channels).
    frame_bytes = sample_width * channels
    usable = len(audio) - len(audio) % frame_bytes
    frames = np.frombuffer(audio[:usable], dtype=dtype).reshape(-1, channels)

    num_frames = int(frames.shape[0] * to_rate // from_rate)
    if num_frames == 0:
        return b""

    # axis=0 resamples along time, independently for every channel column.
    resampled = signal.resample(frames, num_frames, axis=0)

    # FFT resampling can overshoot the input range; round and clip so the
    # integer cast cannot wrap around.
    limits = np.iinfo(dtype)
    bounded = np.clip(np.rint(resampled), limits.min, limits.max)
    return bounded.astype(dtype).tobytes()


def _simple_resample(
    audio: bytes,
    from_rate: int,
    to_rate: int,
    sample_width: int = 2,
    channels: int = 1,
) -> bytes:
    """
    Simple linear interpolation resampling (pure Python).

    Args:
        audio: Input audio bytes (signed little-endian PCM, interleaved)
        from_rate: Original sample rate (must be positive)
        to_rate: Target sample rate (must be positive)
        sample_width: Bytes per sample (2 for 16-bit, 4 for 32-bit)
        channels: Number of interleaved channels

    Returns:
        Resampled audio bytes. A trailing incomplete frame is dropped.

    Raises:
        ValueError: If the sample width or channel count is unsupported.
    """
    if sample_width == 2:
        code = "h"
    elif sample_width == 4:
        code = "i"
    else:
        raise ValueError(f"Unsupported sample width: {sample_width}")
    if channels < 1:
        raise ValueError(f"Unsupported channel count: {channels}")

    # Decode every whole frame in a single call.
    num_frames = len(audio) // (sample_width * channels)
    samples = struct.unpack_from(f"<{num_frames * channels}{code}", audio)

    new_frames = int(num_frames * to_rate // from_rate)
    ratio = from_rate / to_rate
    last = num_frames - 1
    resampled = []

    for i in range(new_frames):
        pos = i * ratio
        # Clamp guards against float error pushing the index past the end.
        idx = min(int(pos), last)
        frac = pos - idx
        base = idx * channels

        if idx < last:
            following = base + channels
            for ch in range(channels):
                resampled.append(
                    round(
                        samples[base + ch] * (1 - frac)
                        + samples[following + ch] * frac
                    )
                )
        else:
            # No next frame to interpolate towards: hold the last frame.
            resampled.extend(samples[base : base + channels])

    return struct.pack(f"<{len(resampled)}{code}", *resampled)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def pcm_to_wav(
    pcm_data: bytes,
    sample_rate: int = 16000,
    channels: int = 1,
    bits_per_sample: int = 16,
) -> bytes:
    """
    Convert raw PCM data to WAV format.

    Prepends a 44-byte RIFF/WAVE header (PCM format tag 1) to the data.

    Args:
        pcm_data: Raw PCM audio bytes
        sample_rate: Sample rate in Hz
        channels: Number of audio channels
        bits_per_sample: Bits per sample (typically 16)

    Returns:
        WAV file as bytes. If ``pcm_data`` has an odd length, one zero pad
        byte is appended after the data chunk; the data chunk size field
        still reports the unpadded length.
    """
    # Samples are stored in whole bytes, so round the container size up
    # (e.g. 12-bit samples occupy 2 bytes, 20-bit samples occupy 3).
    bytes_per_sample = (bits_per_sample + 7) // 8
    block_align = channels * bytes_per_sample
    byte_rate = sample_rate * block_align
    data_size = len(pcm_data)

    # RIFF chunks are word-aligned: odd-sized chunk data is followed by a
    # pad byte that is counted in the RIFF size but not in the chunk size.
    padding = b"\x00" * (data_size % 2)

    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + data_size + len(padding),
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk size
        1,  # audio format (PCM)
        channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        data_size,
    )

    return header + pcm_data + padding
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def wav_to_pcm(wav_data: bytes) -> tuple[bytes, int, int, int]:
    """
    Extract raw PCM data from WAV format.

    Walks the RIFF chunk list, reading the format fields from the
    ``fmt `` chunk and returning as soon as the ``data`` chunk is found.
    Other chunks are skipped.

    Args:
        wav_data: WAV file as bytes

    Returns:
        Tuple of (pcm_data, sample_rate, channels, bits_per_sample).
        The three format values are 0 if no ``fmt `` chunk precedes the
        ``data`` chunk.

    Raises:
        ValueError: If the RIFF/WAVE signature is missing, the ``fmt ``
            chunk is too short, or no ``data`` chunk is found.
    """
    # Parse RIFF header
    if wav_data[:4] != b"RIFF" or wav_data[8:12] != b"WAVE":
        raise ValueError("Invalid WAV file")

    sample_rate = 0
    channels = 0
    bits_per_sample = 0

    total = len(wav_data)
    pos = 12

    # Each chunk starts with an 8-byte header (4-byte id + 4-byte size);
    # stop once fewer than 8 bytes remain instead of unpacking a short read.
    while pos + 8 <= total:
        chunk_id, chunk_size = struct.unpack_from("<4sI", wav_data, pos)
        body = pos + 8

        if chunk_id == b"fmt ":
            if chunk_size < 16 or body + 16 > total:
                raise ValueError("Invalid WAV file: truncated fmt chunk")
            _, channels, sample_rate, _, _, bits_per_sample = (
                struct.unpack_from("<HHIIHH", wav_data, body)
            )
        elif chunk_id == b"data":
            # Slicing tolerates a data chunk whose declared size runs past
            # the end of the buffer (returns what is available).
            pcm_data = wav_data[body : body + chunk_size]
            return pcm_data, sample_rate, channels, bits_per_sample

        # Chunks are word-aligned: odd-sized chunks carry one pad byte.
        pos = body + chunk_size + (chunk_size & 1)

    raise ValueError("No data chunk found in WAV file")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def get_audio_duration(
    audio: bytes,
    sample_rate: int,
    channels: int = 1,
    bits_per_sample: int = 16,
) -> float:
    """
    Calculate duration of PCM audio in seconds.

    Args:
        audio: PCM audio bytes
        sample_rate: Sample rate in Hz
        channels: Number of audio channels
        bits_per_sample: Bits per sample

    Returns:
        Duration in seconds, counting only complete frames (a trailing
        partial frame is ignored).

    Raises:
        ZeroDivisionError: If ``sample_rate`` or ``channels`` is 0.
    """
    # Samples occupy whole bytes, so round up: 20-bit audio uses 3 bytes
    # per sample, and depths below 8 bits still use 1 byte.
    bytes_per_sample = (bits_per_sample + 7) // 8
    frame_size = bytes_per_sample * channels
    total_frames = len(audio) // frame_size
    return total_frames / sample_rate
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def normalize_audio(audio: bytes, target_db: float = -20.0) -> bytes:
    """
    Normalize audio to a target dB level.

    Scales the samples so their RMS matches ``target_db`` relative to
    16-bit full scale (32768), then rounds and clips to the int16 range.

    Args:
        audio: PCM audio bytes (16-bit, little-endian)
        target_db: Target dB level

    Returns:
        Normalized audio bytes. The input is returned unchanged when it
        holds no complete sample, when it is pure silence, or when numpy
        is not installed. A trailing odd byte is passed through as-is.
    """
    try:
        import numpy as np
    except ImportError:
        # Return unchanged if numpy not available
        return audio

    # Only whole 16-bit samples can be decoded; keep any stray byte aside.
    usable = len(audio) - (len(audio) % 2)
    if usable == 0:
        return audio

    # float64 avoids precision loss when squaring and averaging long clips.
    samples = np.frombuffer(audio[:usable], dtype="<i2").astype(np.float64)

    # Calculate current RMS
    rms = np.sqrt(np.mean(samples**2))
    if rms == 0:
        return audio

    # Calculate target RMS
    target_rms = 32768 * (10 ** (target_db / 20))

    # Scale, rounding to the nearest integer instead of truncating
    gain = target_rms / rms
    normalized = np.clip(np.rint(samples * gain), -32768, 32767).astype("<i2")

    return normalized.tobytes() + audio[usable:]
|