neverlib 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/__init__.py +2 -2
- neverlib/audio_aug/__init__.py +1 -1
- neverlib/audio_aug/audio_aug.py +4 -5
- neverlib/dataAnalyze/README.md +234 -0
- neverlib/dataAnalyze/__init__.py +87 -0
- neverlib/dataAnalyze/dataset_analyzer.py +590 -0
- neverlib/dataAnalyze/quality_metrics.py +364 -0
- neverlib/dataAnalyze/rms_distrubution.py +62 -0
- neverlib/dataAnalyze/spectral_analysis.py +218 -0
- neverlib/dataAnalyze/statistics.py +406 -0
- neverlib/dataAnalyze/temporal_features.py +126 -0
- neverlib/dataAnalyze/visualization.py +468 -0
- neverlib/filter/AudoEQ/README.md +165 -0
- neverlib/filter/AudoEQ/auto_eq_de.py +361 -0
- neverlib/filter/AudoEQ/auto_eq_ga_advanced.py +577 -0
- neverlib/filter/AudoEQ/auto_eq_ga_basic.py +380 -0
- neverlib/filter/AudoEQ/auto_eq_spectral_direct.py +75 -0
- neverlib/filter/README.md +101 -0
- neverlib/filter/__init__.py +7 -0
- neverlib/filter/biquad.py +45 -0
- neverlib/filter/common.py +5 -6
- neverlib/filter/core.py +339 -0
- neverlib/metrics/dnsmos.py +160 -0
- neverlib/metrics/snr.py +177 -0
- neverlib/metrics/spec.py +45 -0
- neverlib/metrics/test_pesq.py +35 -0
- neverlib/metrics/time.py +68 -0
- neverlib/tests/test_vad.py +21 -0
- neverlib/utils/audio_split.py +5 -3
- neverlib/utils/message.py +4 -4
- neverlib/utils/utils.py +32 -15
- neverlib/vad/PreProcess.py +1 -1
- neverlib/vad/README.md +10 -10
- neverlib/vad/VAD_Energy.py +1 -1
- neverlib/vad/VAD_Silero.py +1 -1
- neverlib/vad/VAD_WebRTC.py +1 -1
- neverlib/vad/VAD_funasr.py +1 -1
- neverlib/vad/VAD_statistics.py +3 -3
- neverlib/vad/VAD_vadlib.py +2 -2
- neverlib/vad/VAD_whisper.py +1 -1
- neverlib/vad/__init__.py +1 -1
- neverlib/vad/class_get_speech.py +4 -4
- neverlib/vad/class_vad.py +1 -1
- neverlib/vad/utils.py +47 -5
- {neverlib-0.2.1.dist-info → neverlib-0.2.3.dist-info}/METADATA +120 -120
- neverlib-0.2.3.dist-info/RECORD +53 -0
- {neverlib-0.2.1.dist-info → neverlib-0.2.3.dist-info}/WHEEL +1 -1
- neverlib/Documents/vad/VAD_Energy.ipynb +0 -159
- neverlib/Documents/vad/VAD_Silero.ipynb +0 -305
- neverlib/Documents/vad/VAD_WebRTC.ipynb +0 -183
- neverlib/Documents/vad/VAD_funasr.ipynb +0 -179
- neverlib/Documents/vad/VAD_ppasr.ipynb +0 -175
- neverlib/Documents/vad/VAD_statistics.ipynb +0 -522
- neverlib/Documents/vad/VAD_vadlib.ipynb +0 -184
- neverlib/Documents/vad/VAD_whisper.ipynb +0 -430
- neverlib/utils/waveform_analyzer.py +0 -51
- neverlib/wav_data/000_short.wav +0 -0
- neverlib-0.2.1.dist-info/RECORD +0 -40
- {neverlib-0.2.1.dist-info → neverlib-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.1.dist-info → neverlib-0.2.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
"""
|
|
2
|
+
音频质量评估模块
|
|
3
|
+
Audio Quality Metrics Module
|
|
4
|
+
|
|
5
|
+
提供音频质量评估和失真度分析功能
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import librosa
|
|
10
|
+
from scipy import signal
|
|
11
|
+
from scipy.fft import fft, fftfreq
|
|
12
|
+
from typing import Tuple, Optional, Union, List
|
|
13
|
+
import warnings
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class QualityAnalyzer:
    """Audio quality analyzer.

    Bundles a handful of signal-level quality metrics (SNR, THD, dynamic
    range, frequency response, a simplified loudness range and spectral
    distortion) that share one sampling rate.

    NOTE(review): the methods use ``len()`` and 1-D slicing, so they
    presumably expect mono 1-D sample arrays - confirm against callers.
    """

    def __init__(self, sr: int = 22050):
        """Initialise the analyzer.

        Args:
            sr: Sampling rate in Hz. Used to build frequency axes and to
                convert block durations into sample counts.
        """
        self.sr = sr

    def signal_to_noise_ratio(self, signal_audio: np.ndarray,
                              noise_audio: Optional[np.ndarray] = None,
                              signal_start: Optional[int] = None,
                              signal_end: Optional[int] = None) -> float:
        """Compute the signal-to-noise ratio (SNR) in dB.

        Args:
            signal_audio: Audio containing signal plus noise.
            noise_audio: Optional noise-only audio. When given, the power of
                ``signal_audio`` is compared against the power of this array.
            signal_start: Start index of the signal region inside
                ``signal_audio`` (required when ``noise_audio`` is None).
            signal_end: End index (exclusive) of the signal region
                (required when ``noise_audio`` is None).

        Returns:
            SNR in dB. ``inf`` when the noise power is zero, ``-inf`` when
            the signal power is zero (and the noise power is not).

        Raises:
            ValueError: If ``noise_audio`` is None and either index is missing.
        """
        # Work in float64 so integer PCM (e.g. int16) cannot overflow when
        # the samples are squared.
        signal_audio = np.asarray(signal_audio, dtype=np.float64)

        if noise_audio is not None:
            signal_power = np.mean(signal_audio ** 2)
            noise_power = np.mean(np.asarray(noise_audio, dtype=np.float64) ** 2)
        else:
            if signal_start is None or signal_end is None:
                raise ValueError("Must provide signal_start and signal_end when noise_audio is None")

            signal_part = signal_audio[signal_start:signal_end]

            # Everything before and after the signal region is treated as noise.
            noise_start = signal_audio[:signal_start] if signal_start > 0 else np.array([])
            noise_end = signal_audio[signal_end:] if signal_end < len(signal_audio) else np.array([])
            if len(noise_start) > 0 or len(noise_end) > 0:
                noise_part = np.concatenate([noise_start, noise_end])
            else:
                # No samples outside the signal region: fall back to the
                # first 1000 samples of the recording as the noise estimate.
                noise_part = signal_audio[:1000]

            signal_power = np.mean(signal_part ** 2)
            noise_power = np.mean(noise_part ** 2)

        if noise_power == 0:
            return float('inf')
        if signal_power == 0:
            # Same value log10(0) would give, without the RuntimeWarning.
            return float('-inf')

        snr_db = 10 * np.log10(signal_power / noise_power)
        return snr_db

    def total_harmonic_distortion(self, audio: np.ndarray,
                                  fundamental_freq: Optional[float] = None,
                                  num_harmonics: int = 5) -> float:
        """Compute total harmonic distortion (THD) as a percentage.

        Args:
            audio: Audio signal.
            fundamental_freq: Fundamental frequency in Hz. When None, the
                positive-frequency bin with the largest magnitude is used.
            num_harmonics: Number of harmonics (2nd, 3rd, ...) to include.

        Returns:
            THD in percent. ``0.0`` when the input is too short to have a
            positive-frequency bin, when no bin lies within tolerance of the
            fundamental, or when the fundamental has zero power.
        """
        if len(audio) == 0:
            return 0.0

        spectrum = fft(audio)
        freqs = fftfreq(len(audio), 1 / self.sr)
        magnitude = np.abs(spectrum)

        # Keep strictly positive frequencies only (drops DC as well).
        positive_idx = freqs > 0
        freqs = freqs[positive_idx]
        magnitude = magnitude[positive_idx]

        if magnitude.size == 0:
            # One- or two-sample input: nothing to analyse.
            return 0.0

        if fundamental_freq is None:
            fundamental_freq = freqs[np.argmax(magnitude)]

        # Bins within 5% of the fundamental count as "at" a given frequency;
        # the same absolute tolerance is used for every harmonic.
        tolerance = fundamental_freq * 0.05

        fundamental_idx = np.where(np.abs(freqs - fundamental_freq) < tolerance)[0]
        if len(fundamental_idx) == 0:
            return 0.0

        fundamental_power = np.max(magnitude[fundamental_idx]) ** 2
        if fundamental_power == 0:
            return 0.0

        harmonic_power = 0
        for h in range(2, num_harmonics + 2):
            harmonic_freq = h * fundamental_freq
            harmonic_idx = np.where(np.abs(freqs - harmonic_freq) < tolerance)[0]
            # Harmonics with no bin in range (e.g. above Nyquist) contribute nothing.
            if len(harmonic_idx) > 0:
                harmonic_power += np.max(magnitude[harmonic_idx]) ** 2

        thd = np.sqrt(harmonic_power / fundamental_power) * 100
        return thd

    def dynamic_range(self, audio: np.ndarray, percentile_low: float = 1,
                      percentile_high: float = 99) -> float:
        """Compute the dynamic range in dB from amplitude percentiles.

        Args:
            audio: Audio signal.
            percentile_low: Percentile used as the low level.
            percentile_high: Percentile used as the high level.

        Returns:
            ``20 * log10(high / low)`` over the non-zero absolute sample
            values, or ``0.0`` when every sample is zero (or there are none).
        """
        amplitude = np.abs(audio)
        amplitude = amplitude[amplitude > 0]  # avoid log(0)

        if len(amplitude) == 0:
            return 0.0

        low_level = np.percentile(amplitude, percentile_low)
        high_level = np.percentile(amplitude, percentile_high)

        dynamic_range_db = 20 * np.log10(high_level / (low_level + 1e-10))
        return dynamic_range_db

    def frequency_response(self, audio: np.ndarray,
                           reference_audio: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """Compute a magnitude frequency response.

        With a reference, the response is estimated as the spectrum of the
        cross-correlation (audio vs. reference) divided by the spectrum of the
        reference auto-correlation. Without one, the magnitude spectrum of
        ``audio`` is returned.

        Args:
            audio: Test audio signal.
            reference_audio: Optional reference signal.

        Returns:
            ``(frequencies, magnitude)`` restricted to frequencies >= 0.
        """
        if reference_audio is not None:
            cross_corr = signal.correlate(audio, reference_audio, mode='full')
            auto_corr = signal.correlate(reference_audio, reference_audio, mode='full')

            # The two correlations differ in length when audio and reference
            # do; zero-pad both to a common FFT size so the spectra share one
            # frequency grid and can be divided bin by bin. For equal-length
            # inputs this is the plain FFT of each sequence.
            n_points = max(len(cross_corr), len(auto_corr))
            cross_spectrum = fft(cross_corr, n=n_points)
            auto_spectrum = fft(auto_corr, n=n_points)

            # Small constant guards against division by zero.
            h_measured = cross_spectrum / (auto_spectrum + 1e-10)
            freqs = fftfreq(len(h_measured), 1 / self.sr)

            positive_idx = freqs >= 0
            freqs = freqs[positive_idx]
            h_measured = h_measured[positive_idx]

            return freqs, np.abs(h_measured)
        else:
            spectrum = fft(audio)
            freqs = fftfreq(len(audio), 1 / self.sr)

            positive_idx = freqs >= 0
            freqs = freqs[positive_idx]
            spectrum = spectrum[positive_idx]

            return freqs, np.abs(spectrum)

    def loudness_range(self, audio: np.ndarray, gate_threshold: float = -70) -> dict:
        """Compute loudness statistics (simplified, EBU R128-inspired).

        Loudness is approximated by the RMS level (in dB) of 400 ms blocks
        taken every 100 ms; blocks at or below ``gate_threshold`` are dropped.

        Args:
            audio: Audio signal.
            gate_threshold: Gate threshold in dB.

        Returns:
            Dict with ``integrated_loudness`` (mean block level),
            ``loudness_range`` (95th minus 10th percentile) and
            ``max_loudness``. When no block passes the gate the levels are
            ``-inf`` and the range is ``0``.
        """
        # max(1, ...) keeps range() valid for very small sampling rates.
        block_size = max(1, int(0.4 * self.sr))  # 400 ms block
        hop_size = max(1, int(0.1 * self.sr))    # 100 ms hop

        blocks = []
        # "+ 1" so the last full block (start == len - block_size) is included.
        for i in range(0, len(audio) - block_size + 1, hop_size):
            block = audio[i:i + block_size]
            # Simplified loudness: RMS level of the block.
            rms = np.sqrt(np.mean(block ** 2))
            if rms > 0:
                loudness = 20 * np.log10(rms)
                if loudness > gate_threshold:
                    blocks.append(loudness)

        if len(blocks) == 0:
            return {'integrated_loudness': -float('inf'), 'loudness_range': 0, 'max_loudness': -float('inf')}

        blocks = np.array(blocks)

        integrated_loudness = np.mean(blocks)
        loudness_range = np.percentile(blocks, 95) - np.percentile(blocks, 10)
        max_loudness = np.max(blocks)

        return {
            'integrated_loudness': integrated_loudness,
            'loudness_range': loudness_range,
            'max_loudness': max_loudness
        }

    def spectral_distortion(self, original: np.ndarray, processed: np.ndarray) -> float:
        """Compute spectral distortion in dB.

        Both signals are truncated to the shorter length, then the mean
        squared difference of their FFT magnitudes is expressed relative to
        the mean squared magnitude of the original.

        Args:
            original: Original audio.
            processed: Processed audio.

        Returns:
            Distortion in dB. ``inf`` when the original spectrum has zero
            power, ``-inf`` when the two magnitude spectra are identical.
        """
        min_len = min(len(original), len(processed))
        original = original[:min_len]
        processed = processed[:min_len]

        orig_spectrum = np.abs(fft(original))
        proc_spectrum = np.abs(fft(processed))

        mse = np.mean((orig_spectrum - proc_spectrum) ** 2)
        orig_power = np.mean(orig_spectrum ** 2)

        if orig_power == 0:
            return float('inf')
        if mse == 0:
            # Same value log10(0) would give, without the RuntimeWarning.
            return float('-inf')

        distortion_db = 10 * np.log10(mse / orig_power)
        return distortion_db
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def comprehensive_quality_assessment(audio: np.ndarray, sr: int = 22050,
                                     reference: Optional[np.ndarray] = None) -> dict:
    """Run the available quality metrics on one signal and collect the results.

    Each optional metric is best-effort: if it raises, its entry is set to
    ``None`` and a ``RuntimeWarning`` describing the failure is issued.

    Args:
        audio: Audio to evaluate.
        sr: Sampling rate in Hz.
        reference: Optional reference audio; enables the comparison metrics.

    Returns:
        Dict with ``dynamic_range``, ``loudness_stats``, ``thd`` and
        ``frequency_response`` (a dict with ``frequencies`` and ``magnitude``);
        ``snr`` and ``spectral_distortion`` are added when ``reference`` is given.
    """
    import warnings  # local import keeps this block self-contained

    analyzer = QualityAnalyzer(sr=sr)

    results = {
        'dynamic_range': analyzer.dynamic_range(audio),
        'loudness_stats': analyzer.loudness_range(audio),
    }

    # THD (fundamental is auto-detected).
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt
    # and SystemExit are not swallowed.
    try:
        results['thd'] = analyzer.total_harmonic_distortion(audio)
    except Exception as exc:
        warnings.warn(f"THD computation failed: {exc}", RuntimeWarning)
        results['thd'] = None

    # Comparison metrics, only when a reference is available.
    if reference is not None:
        try:
            # NOTE(review): `reference` is passed as the noise-only argument of
            # signal_to_noise_ratio, i.e. this is power(audio) / power(reference).
            # Confirm that is the intended definition of 'snr' here.
            results['snr'] = analyzer.signal_to_noise_ratio(audio, reference)
            results['spectral_distortion'] = analyzer.spectral_distortion(reference, audio)
        except Exception as exc:
            warnings.warn(f"Reference-based metrics failed: {exc}", RuntimeWarning)
            results['snr'] = None
            results['spectral_distortion'] = None

    # Frequency response (relative to the reference when one is given).
    try:
        freqs, response = analyzer.frequency_response(audio, reference)
        results['frequency_response'] = {
            'frequencies': freqs,
            'magnitude': response
        }
    except Exception as exc:
        warnings.warn(f"Frequency response computation failed: {exc}", RuntimeWarning)
        results['frequency_response'] = None

    return results
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def audio_health_check(audio: np.ndarray, sr: int = 22050) -> dict:
    """Run basic sanity checks on an audio signal.

    Args:
        audio: Audio signal.
        sr: Sampling rate in Hz.

    Returns:
        Dict with three keys: ``issues`` (list of str), ``warnings`` (list of
        str) and ``stats`` (``max_amplitude``, ``min_amplitude``,
        ``mean_amplitude``, ``duration`` in seconds). An empty input yields
        zeroed stats and a single ``'Empty audio signal'`` issue.
    """
    health_report = {
        'issues': [],
        'warnings': [],
        'stats': {}
    }

    # An empty signal has no max/min; report it instead of letting np.max raise.
    if len(audio) == 0:
        health_report['stats'] = {
            'max_amplitude': 0.0,
            'min_amplitude': 0.0,
            'mean_amplitude': 0.0,
            'duration': 0.0
        }
        health_report['issues'].append('Empty audio signal')
        return health_report

    # Basic statistics on the absolute sample values (computed once).
    abs_audio = np.abs(audio)
    max_amplitude = np.max(abs_audio)
    min_amplitude = np.min(abs_audio)
    mean_amplitude = np.mean(abs_audio)

    health_report['stats'] = {
        'max_amplitude': max_amplitude,
        'min_amplitude': min_amplitude,
        'mean_amplitude': mean_amplitude,
        'duration': len(audio) / sr
    }

    # NOTE(review): the fixed thresholds below (0.99, 0.01, 1e-6) presumably
    # assume float samples normalised to [-1, 1] - confirm against callers.

    # Clipping
    if max_amplitude >= 0.99:
        health_report['issues'].append('Potential clipping detected')

    # Very low level
    if max_amplitude < 0.01:
        health_report['warnings'].append('Very low signal level')

    # Silence
    if mean_amplitude < 1e-6:
        health_report['issues'].append('Signal appears to be silent')

    # DC offset
    dc_offset = np.mean(audio)
    if abs(dc_offset) > 0.01:
        health_report['warnings'].append(f'DC offset detected: {dc_offset:.4f}')

    # Dynamic range
    analyzer = QualityAnalyzer(sr=sr)
    dynamic_range = analyzer.dynamic_range(audio)
    if dynamic_range < 6:
        health_report['warnings'].append('Low dynamic range')
    elif dynamic_range > 60:
        health_report['warnings'].append('Very high dynamic range - check for noise')

    return health_report
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-03-26 22:13:22
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
# -*- coding:utf-8 -*-
|
|
7
|
+
# Author:凌逆战 | Never
|
|
8
|
+
# Date: 2025/3/2
|
|
9
|
+
"""
|
|
10
|
+
统计音频语音段rms值分布
|
|
11
|
+
"""
|
|
12
|
+
import sys
|
|
13
|
+
sys.path.append("../../../")
|
|
14
|
+
import torch
|
|
15
|
+
import soundfile as sf
|
|
16
|
+
from neverlib.utils import get_path_list
|
|
17
|
+
from neverlib.filter import HPFilter
|
|
18
|
+
from neverlib.audio_aug import volume_norm
|
|
19
|
+
from neverlib.dataAnalyze.utils import rms_amplitude
|
|
20
|
+
from joblib import Parallel, delayed
|
|
21
|
+
import matplotlib.pyplot as plt
|
|
22
|
+
import numpy as np
|
|
23
|
+
import librosa
|
|
24
|
+
import os
|
|
25
|
+
from utils.train_utils import from_path_get_vadpoint
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_rms_vad(wav_path):
    """Return the mean RMS value of the voiced segment of one wav file.

    The file is read with ``always_2d=True``, its sampling rate is checked
    against the module-level ``sr``, and the samples between the VAD start and
    end points returned by ``from_path_get_vadpoint(wav_path)`` are passed to
    ``rms_amplitude``; the mean of that result is returned.

    NOTE(review): the unit of the returned value depends on ``rms_amplitude``
    (defined elsewhere); the plot that consumes it labels the axis "RMS (dB)".

    Args:
        wav_path: Path of the wav file to analyse.

    Returns:
        Mean of ``rms_amplitude`` over the voiced segment.

    Raises:
        AssertionError: If the file's sampling rate differs from ``sr``.
    """
    samples, wav_sr = sf.read(wav_path, always_2d=True)  # (num_samples, channels)
    assert wav_sr == sr, f"期望采样率为{sr}, 但是为{wav_sr}, 文件名: {wav_path}"

    vad_begin, vad_stop = from_path_get_vadpoint(wav_path)
    voiced = samples[vad_begin:vad_stop]

    # Debug aid: print files whose level is below -75 or above -5 to find outliers.
    return rms_amplitude(voiced).mean()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Expected sampling rate of every input file (checked in get_rms_vad).
sr = 16000

# Dataset directories that are scanned for wav files.
wav_dir_list = [
    "/data/never/Dataset/kws_data/Command_Word/Crowdsourcing/en_kws2/train/RealPerson",
    "/data/never/Dataset/kws_data/Command_Word/Crowdsourcing/en_kws2/val/RealPerson",
    "/data/never/Dataset/kws_data/Command_Word/Crowdsourcing/en_kws2/test/RealPerson",
]
wav_path_list = []
for wav_dir in wav_dir_list:
    wav_path_list.extend(get_path_list(wav_dir, end="*.wav"))

# One RMS value per file, computed in parallel.
rms_list = Parallel(n_jobs=64)(delayed(get_rms_vad)(wav_path) for wav_path in wav_path_list)

# Plot the histogram of the per-file RMS values.
plt.hist(rms_list, bins=100, edgecolor='black')
plt.title("RMS Distribution")
plt.xlabel("RMS (dB)")
plt.ylabel("number")
plt.grid(True)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./png_dist", exist_ok=True)
plt.savefig("./png_dist/rms_distribution.png")
|
|
62
|
+
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""
|
|
2
|
+
频域分析模块
|
|
3
|
+
Spectral Analysis Module
|
|
4
|
+
|
|
5
|
+
提供音频频域特征提取和分析功能
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import librosa
|
|
10
|
+
import scipy.signal
|
|
11
|
+
from scipy.fft import fft, fftfreq
|
|
12
|
+
from typing import Tuple, Optional, Union
|
|
13
|
+
import warnings
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SpectralAnalyzer:
    """Spectral analyzer.

    Thin wrapper around ``librosa`` / ``scipy.fft`` that applies one shared
    set of analysis parameters (sampling rate, FFT size, hop length) to every
    feature it computes.
    """

    def __init__(self, sr: int = 22050, n_fft: int = 2048, hop_length: int = 512):
        """Initialise the analyzer.

        Args:
            sr: Sampling rate in Hz.
            n_fft: FFT window size used by the frame-based features.
            hop_length: Hop length (in samples) between frames.
        """
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length

    def compute_stft(self, audio: np.ndarray) -> np.ndarray:
        """Compute the short-time Fourier transform.

        Args:
            audio: Audio signal.

        Returns:
            Result of ``librosa.stft`` with this analyzer's ``n_fft`` and
            ``hop_length``.
        """
        return librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)

    def compute_magnitude_spectrum(self, audio: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute the magnitude spectrum of the whole signal.

        Args:
            audio: Audio signal.

        Returns:
            ``(frequencies, magnitude)`` of a single FFT over the full signal,
            restricted to the bins whose ``fftfreq`` value is >= 0.
        """
        spectrum = fft(audio)
        magnitude = np.abs(spectrum)
        freqs = fftfreq(len(audio), 1 / self.sr)

        # Keep the non-negative frequency half only.
        positive_freq_idx = freqs >= 0
        return freqs[positive_freq_idx], magnitude[positive_freq_idx]

    def spectral_centroid(self, audio: np.ndarray) -> np.ndarray:
        """Compute the spectral centroid.

        Args:
            audio: Audio signal.

        Returns:
            First row of ``librosa.feature.spectral_centroid``.
        """
        # n_fft is passed explicitly so the configured window size is honoured
        # (it equals librosa's default when the analyzer uses its defaults).
        return librosa.feature.spectral_centroid(
            y=audio, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length
        )[0]

    def spectral_rolloff(self, audio: np.ndarray, roll_percent: float = 0.85) -> np.ndarray:
        """Compute the spectral roll-off.

        Args:
            audio: Audio signal.
            roll_percent: Roll-off fraction forwarded to librosa.

        Returns:
            First row of ``librosa.feature.spectral_rolloff``.
        """
        return librosa.feature.spectral_rolloff(
            y=audio, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length,
            roll_percent=roll_percent
        )[0]

    def spectral_flatness(self, audio: np.ndarray) -> np.ndarray:
        """Compute the spectral flatness.

        Args:
            audio: Audio signal.

        Returns:
            First row of ``librosa.feature.spectral_flatness``.
        """
        return librosa.feature.spectral_flatness(
            y=audio, n_fft=self.n_fft, hop_length=self.hop_length
        )[0]

    def spectral_contrast(self, audio: np.ndarray, n_bands: int = 6) -> np.ndarray:
        """Compute the spectral contrast.

        Args:
            audio: Audio signal.
            n_bands: Number of frequency bands.

        Returns:
            Spectral contrast matrix from ``librosa.feature.spectral_contrast``.
        """
        return librosa.feature.spectral_contrast(
            y=audio, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length,
            n_bands=n_bands
        )

    def mfcc_features(self, audio: np.ndarray, n_mfcc: int = 13) -> np.ndarray:
        """Extract MFCC features.

        Args:
            audio: Audio signal.
            n_mfcc: Number of MFCC coefficients.

        Returns:
            MFCC matrix from ``librosa.feature.mfcc``.
        """
        return librosa.feature.mfcc(
            y=audio, sr=self.sr, n_mfcc=n_mfcc, n_fft=self.n_fft,
            hop_length=self.hop_length
        )

    def mel_spectrogram(self, audio: np.ndarray, n_mels: int = 128) -> np.ndarray:
        """Compute the mel spectrogram.

        Args:
            audio: Audio signal.
            n_mels: Number of mel filter banks.

        Returns:
            Mel spectrogram from ``librosa.feature.melspectrogram``.
        """
        return librosa.feature.melspectrogram(
            y=audio, sr=self.sr, n_mels=n_mels, n_fft=self.n_fft,
            hop_length=self.hop_length
        )

    def chroma_features(self, audio: np.ndarray) -> np.ndarray:
        """Extract chroma features.

        Args:
            audio: Audio signal.

        Returns:
            Chroma matrix from ``librosa.feature.chroma_stft``.
        """
        return librosa.feature.chroma_stft(
            y=audio, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length
        )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def compute_spectral_features(audio: np.ndarray, sr: int = 22050) -> dict:
    """Compute the full set of frame-based spectral features for one signal.

    A ``SpectralAnalyzer`` with default FFT size and hop length is created
    for the given sampling rate and each of its feature methods is called once.

    Args:
        audio: Audio signal.
        sr: Sampling rate in Hz.

    Returns:
        Dict keyed by ``spectral_centroid``, ``spectral_rolloff``,
        ``spectral_flatness``, ``spectral_contrast``, ``mfcc``,
        ``mel_spectrogram`` and ``chroma``.
    """
    extractor = SpectralAnalyzer(sr=sr)

    # (result key, bound extractor method), evaluated in this order.
    feature_table = (
        ('spectral_centroid', extractor.spectral_centroid),
        ('spectral_rolloff', extractor.spectral_rolloff),
        ('spectral_flatness', extractor.spectral_flatness),
        ('spectral_contrast', extractor.spectral_contrast),
        ('mfcc', extractor.mfcc_features),
        ('mel_spectrogram', extractor.mel_spectrogram),
        ('chroma', extractor.chroma_features),
    )

    features = {}
    for key, compute in feature_table:
        features[key] = compute(audio)
    return features
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def frequency_domain_stats(audio: np.ndarray, sr: int = 22050) -> dict:
    """Compute summary statistics of the signal's power spectrum.

    Args:
        audio: Audio signal.
        sr: Sampling rate in Hz.

    Returns:
        Dict with:
            ``mean_frequency``: power-weighted mean frequency.
            ``std_frequency``: power-weighted standard deviation of frequency.
            ``peak_frequency``: frequency of the largest-magnitude bin.
            ``bandwidth``: span between the first and last bin whose power
                exceeds half of the maximum power.
            ``spectral_energy``: sum of the power over the retained bins.
            ``spectral_entropy``: base-2 entropy of the normalised power.
        Every value is ``0.0`` when the input is empty or carries no power
        in the retained bins (e.g. digital silence).
    """
    silent_stats = {
        'mean_frequency': 0.0,
        'std_frequency': 0.0,
        'peak_frequency': 0.0,
        'bandwidth': 0.0,
        'spectral_energy': 0.0,
        'spectral_entropy': 0.0
    }

    # An empty signal has no spectrum at all.
    if len(audio) == 0:
        return silent_stats

    analyzer = SpectralAnalyzer(sr=sr)
    freqs, magnitude = analyzer.compute_magnitude_spectrum(audio)

    # Power spectrum of the retained (non-negative frequency) bins.
    power = magnitude ** 2
    total_power = np.sum(power)

    # With zero total power the weighted averages are undefined
    # (np.average raises) and no bin exceeds the half-power threshold.
    if total_power == 0:
        return silent_stats

    mean_frequency = np.average(freqs, weights=power)
    # Frequencies of the bins above half of the peak power.
    half_power_freqs = freqs[power > 0.5 * np.max(power)]
    normalised_power = power / total_power

    stats = {
        'mean_frequency': mean_frequency,
        'std_frequency': np.sqrt(np.average((freqs - mean_frequency) ** 2, weights=power)),
        'peak_frequency': freqs[np.argmax(magnitude)],
        'bandwidth': half_power_freqs[-1] - half_power_freqs[0],
        'spectral_energy': total_power,
        # 1e-10 keeps log2 finite for zero-power bins.
        'spectral_entropy': -np.sum(normalised_power * np.log2(normalised_power + 1e-10))
    }

    return stats
|