neverlib 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.claude/settings.local.json +9 -0
- neverlib/Docs/audio_aug/test_volume.ipynb +416 -0
- neverlib/Docs/audio_aug_test/test_volume.ipynb +289 -0
- neverlib/Docs/filter/biquad.ipynb +129 -0
- neverlib/Docs/filter/filter_family.ipynb +450 -0
- neverlib/Docs/filter/highpass.ipynb +139 -0
- neverlib/Docs/filter/scipy_filter_family.ipynb +110 -0
- neverlib/Docs/vad/VAD_Energy.ipynb +167 -0
- neverlib/Docs/vad/VAD_Silero.ipynb +325 -0
- neverlib/Docs/vad/VAD_WebRTC.ipynb +189 -0
- neverlib/Docs/vad/VAD_funasr.ipynb +192 -0
- neverlib/Docs/vad/VAD_rvADfast.ipynb +162 -0
- neverlib/Docs/vad/VAD_statistics.ipynb +532 -0
- neverlib/Docs/vad/VAD_tenVAD.ipynb +292 -0
- neverlib/Docs/vad/VAD_vadlib.ipynb +168 -0
- neverlib/Docs/vad/VAD_whisper.ipynb +404 -0
- neverlib/QA/gen_init.py +117 -0
- neverlib/QA/get_fun.py +19 -0
- neverlib/__init__.py +21 -4
- neverlib/audio_aug/HarmonicDistortion.py +19 -13
- neverlib/audio_aug/__init__.py +30 -12
- neverlib/audio_aug/audio_aug.py +19 -14
- neverlib/audio_aug/clip_aug.py +15 -18
- neverlib/audio_aug/coder_aug.py +44 -24
- neverlib/audio_aug/coder_aug2.py +54 -37
- neverlib/audio_aug/loss_packet_aug.py +7 -7
- neverlib/audio_aug/quant_aug.py +19 -17
- neverlib/data/000_short_enhance.wav +0 -0
- neverlib/data/3956_speech.wav +0 -0
- neverlib/data/3956_sweep.wav +0 -0
- neverlib/data/vad_example.wav +0 -0
- neverlib/data/white.wav +0 -0
- neverlib/data/white_EQ.wav +0 -0
- neverlib/data/white_matched.wav +0 -0
- neverlib/data_analyze/__init__.py +25 -20
- neverlib/data_analyze/dataset_analyzer.py +109 -114
- neverlib/data_analyze/quality_metrics.py +87 -89
- neverlib/data_analyze/rms_distrubution.py +23 -42
- neverlib/data_analyze/spectral_analysis.py +43 -46
- neverlib/data_analyze/statistics.py +76 -76
- neverlib/data_analyze/temporal_features.py +15 -6
- neverlib/data_analyze/visualization.py +208 -144
- neverlib/filter/__init__.py +17 -20
- neverlib/filter/auto_eq/__init__.py +18 -35
- neverlib/filter/auto_eq/de_eq.py +0 -2
- neverlib/filter/common.py +24 -5
- neverlib/metrics/DNSMOS/bak_ovr.onnx +0 -0
- neverlib/metrics/DNSMOS/model_v8.onnx +0 -0
- neverlib/metrics/DNSMOS/sig.onnx +0 -0
- neverlib/metrics/DNSMOS/sig_bak_ovr.onnx +0 -0
- neverlib/metrics/__init__.py +23 -0
- neverlib/metrics/dnsmos.py +4 -15
- neverlib/metrics/pDNSMOS/sig_bak_ovr.onnx +0 -0
- neverlib/metrics/pesq_c/PESQ +0 -0
- neverlib/metrics/pesq_c/dsp.c +553 -0
- neverlib/metrics/pesq_c/dsp.h +138 -0
- neverlib/metrics/pesq_c/pesq.h +294 -0
- neverlib/metrics/pesq_c/pesqdsp.c +1047 -0
- neverlib/metrics/pesq_c/pesqio.c +392 -0
- neverlib/metrics/pesq_c/pesqmain.c +610 -0
- neverlib/metrics/pesq_c/pesqmod.c +1417 -0
- neverlib/metrics/pesq_c/pesqpar.h +297 -0
- neverlib/metrics/snr.py +5 -1
- neverlib/metrics/spec.py +31 -21
- neverlib/metrics/test_pesq.py +0 -4
- neverlib/tests/test_imports.py +17 -0
- neverlib/utils/__init__.py +26 -15
- neverlib/utils/audio_split.py +5 -1
- neverlib/utils/checkGPU.py +17 -9
- neverlib/utils/lazy_expose.py +29 -0
- neverlib/utils/utils.py +40 -12
- neverlib/vad/__init__.py +33 -25
- neverlib/vad/class_get_speech.py +1 -1
- neverlib/vad/class_vad.py +3 -3
- neverlib/vad/img.png +0 -0
- {neverlib-0.2.5.dist-info → neverlib-0.2.7.dist-info}/METADATA +20 -17
- {neverlib-0.2.5.dist-info → neverlib-0.2.7.dist-info}/RECORD +80 -37
- {neverlib-0.2.5.dist-info → neverlib-0.2.7.dist-info}/WHEEL +0 -0
- {neverlib-0.2.5.dist-info → neverlib-0.2.7.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.5.dist-info → neverlib-0.2.7.dist-info}/top_level.txt +0 -0
|
@@ -4,40 +4,38 @@ Audio Quality Metrics Module
|
|
|
4
4
|
|
|
5
5
|
提供音频质量评估和失真度分析功能
|
|
6
6
|
"""
|
|
7
|
-
|
|
8
|
-
import numpy as np
|
|
9
7
|
import librosa
|
|
8
|
+
import numpy as np
|
|
10
9
|
from scipy import signal
|
|
11
10
|
from scipy.fft import fft, fftfreq
|
|
12
11
|
from typing import Tuple, Optional, Union, List
|
|
13
|
-
import warnings
|
|
14
12
|
|
|
15
13
|
|
|
16
14
|
class QualityAnalyzer:
|
|
17
15
|
"""音频质量分析器类"""
|
|
18
|
-
|
|
16
|
+
|
|
19
17
|
def __init__(self, sr: int = 22050):
|
|
20
18
|
"""
|
|
21
19
|
初始化质量分析器
|
|
22
|
-
|
|
20
|
+
|
|
23
21
|
Args:
|
|
24
22
|
sr: 采样率
|
|
25
23
|
"""
|
|
26
24
|
self.sr = sr
|
|
27
|
-
|
|
28
|
-
def signal_to_noise_ratio(self, signal_audio: np.ndarray,
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
25
|
+
|
|
26
|
+
def signal_to_noise_ratio(self, signal_audio: np.ndarray,
|
|
27
|
+
noise_audio: Optional[np.ndarray] = None,
|
|
28
|
+
signal_start: Optional[int] = None,
|
|
29
|
+
signal_end: Optional[int] = None) -> float:
|
|
32
30
|
"""
|
|
33
31
|
计算信噪比 (SNR)
|
|
34
|
-
|
|
32
|
+
|
|
35
33
|
Args:
|
|
36
34
|
signal_audio: 含有信号和噪声的音频
|
|
37
35
|
noise_audio: 纯噪声音频(可选)
|
|
38
36
|
signal_start: 信号开始位置(当噪声未单独提供时使用)
|
|
39
37
|
signal_end: 信号结束位置(当噪声未单独提供时使用)
|
|
40
|
-
|
|
38
|
+
|
|
41
39
|
Returns:
|
|
42
40
|
SNR值(dB)
|
|
43
41
|
"""
|
|
@@ -49,61 +47,61 @@ class QualityAnalyzer:
|
|
|
49
47
|
# 从音频中提取信号和噪声部分
|
|
50
48
|
if signal_start is None or signal_end is None:
|
|
51
49
|
raise ValueError("Must provide signal_start and signal_end when noise_audio is None")
|
|
52
|
-
|
|
50
|
+
|
|
53
51
|
signal_part = signal_audio[signal_start:signal_end]
|
|
54
|
-
|
|
52
|
+
|
|
55
53
|
# 假设开头和结尾是噪声
|
|
56
54
|
noise_start = signal_audio[:signal_start] if signal_start > 0 else np.array([])
|
|
57
55
|
noise_end = signal_audio[signal_end:] if signal_end < len(signal_audio) else np.array([])
|
|
58
56
|
noise_part = np.concatenate([noise_start, noise_end]) if len(noise_start) > 0 or len(noise_end) > 0 else signal_audio[:1000]
|
|
59
|
-
|
|
57
|
+
|
|
60
58
|
signal_power = np.mean(signal_part ** 2)
|
|
61
59
|
noise_power = np.mean(noise_part ** 2)
|
|
62
|
-
|
|
60
|
+
|
|
63
61
|
if noise_power == 0:
|
|
64
62
|
return float('inf')
|
|
65
|
-
|
|
63
|
+
|
|
66
64
|
snr_db = 10 * np.log10(signal_power / noise_power)
|
|
67
65
|
return snr_db
|
|
68
|
-
|
|
69
|
-
def total_harmonic_distortion(self, audio: np.ndarray,
|
|
70
|
-
|
|
71
|
-
|
|
66
|
+
|
|
67
|
+
def total_harmonic_distortion(self, audio: np.ndarray,
|
|
68
|
+
fundamental_freq: Optional[float] = None,
|
|
69
|
+
num_harmonics: int = 5) -> float:
|
|
72
70
|
"""
|
|
73
71
|
计算总谐波失真 (THD)
|
|
74
|
-
|
|
72
|
+
|
|
75
73
|
Args:
|
|
76
74
|
audio: 音频信号
|
|
77
75
|
fundamental_freq: 基频(Hz), 如果不提供则自动检测
|
|
78
76
|
num_harmonics: 考虑的谐波数量
|
|
79
|
-
|
|
77
|
+
|
|
80
78
|
Returns:
|
|
81
79
|
THD百分比
|
|
82
80
|
"""
|
|
83
81
|
# 计算频谱
|
|
84
82
|
spectrum = fft(audio)
|
|
85
|
-
freqs = fftfreq(len(audio), 1/self.sr)
|
|
83
|
+
freqs = fftfreq(len(audio), 1 / self.sr)
|
|
86
84
|
magnitude = np.abs(spectrum)
|
|
87
|
-
|
|
85
|
+
|
|
88
86
|
# 只考虑正频率
|
|
89
87
|
positive_idx = freqs > 0
|
|
90
88
|
freqs = freqs[positive_idx]
|
|
91
89
|
magnitude = magnitude[positive_idx]
|
|
92
|
-
|
|
90
|
+
|
|
93
91
|
# 如果没有提供基频, 自动检测
|
|
94
92
|
if fundamental_freq is None:
|
|
95
93
|
fundamental_freq = freqs[np.argmax(magnitude)]
|
|
96
|
-
|
|
94
|
+
|
|
97
95
|
# 找到基频和谐波的功率
|
|
98
96
|
tolerance = fundamental_freq * 0.05 # 5%的容差
|
|
99
|
-
|
|
97
|
+
|
|
100
98
|
# 基频功率
|
|
101
99
|
fundamental_idx = np.where(np.abs(freqs - fundamental_freq) < tolerance)[0]
|
|
102
100
|
if len(fundamental_idx) == 0:
|
|
103
101
|
return 0.0
|
|
104
|
-
|
|
102
|
+
|
|
105
103
|
fundamental_power = np.max(magnitude[fundamental_idx]) ** 2
|
|
106
|
-
|
|
104
|
+
|
|
107
105
|
# 谐波功率
|
|
108
106
|
harmonic_power = 0
|
|
109
107
|
for h in range(2, num_harmonics + 2):
|
|
@@ -111,97 +109,97 @@ class QualityAnalyzer:
|
|
|
111
109
|
harmonic_idx = np.where(np.abs(freqs - harmonic_freq) < tolerance)[0]
|
|
112
110
|
if len(harmonic_idx) > 0:
|
|
113
111
|
harmonic_power += np.max(magnitude[harmonic_idx]) ** 2
|
|
114
|
-
|
|
112
|
+
|
|
115
113
|
if fundamental_power == 0:
|
|
116
114
|
return 0.0
|
|
117
|
-
|
|
115
|
+
|
|
118
116
|
thd = np.sqrt(harmonic_power / fundamental_power) * 100
|
|
119
117
|
return thd
|
|
120
|
-
|
|
121
|
-
def dynamic_range(self, audio: np.ndarray, percentile_low: float = 1,
|
|
122
|
-
|
|
118
|
+
|
|
119
|
+
def dynamic_range(self, audio: np.ndarray, percentile_low: float = 1,
|
|
120
|
+
percentile_high: float = 99) -> float:
|
|
123
121
|
"""
|
|
124
122
|
计算动态范围
|
|
125
|
-
|
|
123
|
+
|
|
126
124
|
Args:
|
|
127
125
|
audio: 音频信号
|
|
128
126
|
percentile_low: 低百分位数
|
|
129
127
|
percentile_high: 高百分位数
|
|
130
|
-
|
|
128
|
+
|
|
131
129
|
Returns:
|
|
132
130
|
动态范围(dB)
|
|
133
131
|
"""
|
|
134
132
|
amplitude = np.abs(audio)
|
|
135
133
|
amplitude = amplitude[amplitude > 0] # 避免log(0)
|
|
136
|
-
|
|
134
|
+
|
|
137
135
|
if len(amplitude) == 0:
|
|
138
136
|
return 0.0
|
|
139
|
-
|
|
137
|
+
|
|
140
138
|
low_level = np.percentile(amplitude, percentile_low)
|
|
141
139
|
high_level = np.percentile(amplitude, percentile_high)
|
|
142
|
-
|
|
140
|
+
|
|
143
141
|
dynamic_range_db = 20 * np.log10(high_level / (low_level + 1e-10))
|
|
144
142
|
return dynamic_range_db
|
|
145
|
-
|
|
146
|
-
def frequency_response(self, audio: np.ndarray,
|
|
147
|
-
|
|
143
|
+
|
|
144
|
+
def frequency_response(self, audio: np.ndarray,
|
|
145
|
+
reference_audio: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
|
|
148
146
|
"""
|
|
149
147
|
计算频率响应特性
|
|
150
|
-
|
|
148
|
+
|
|
151
149
|
Args:
|
|
152
150
|
audio: 测试音频信号
|
|
153
151
|
reference_audio: 参考音频信号(可选)
|
|
154
|
-
|
|
152
|
+
|
|
155
153
|
Returns:
|
|
156
154
|
(频率数组, 幅度响应数组)
|
|
157
155
|
"""
|
|
158
156
|
if reference_audio is not None:
|
|
159
157
|
# 计算传递函数
|
|
160
158
|
freqs, h = signal.freqz_zpk(*signal.tf2zpk([1], [1]), fs=self.sr)
|
|
161
|
-
|
|
159
|
+
|
|
162
160
|
# 使用互相关计算频率响应
|
|
163
161
|
cross_corr = signal.correlate(audio, reference_audio, mode='full')
|
|
164
162
|
auto_corr = signal.correlate(reference_audio, reference_audio, mode='full')
|
|
165
|
-
|
|
163
|
+
|
|
166
164
|
# 频域除法得到传递函数
|
|
167
165
|
cross_spectrum = fft(cross_corr)
|
|
168
166
|
auto_spectrum = fft(auto_corr)
|
|
169
|
-
|
|
167
|
+
|
|
170
168
|
h_measured = cross_spectrum / (auto_spectrum + 1e-10)
|
|
171
|
-
freqs = fftfreq(len(h_measured), 1/self.sr)
|
|
172
|
-
|
|
169
|
+
freqs = fftfreq(len(h_measured), 1 / self.sr)
|
|
170
|
+
|
|
173
171
|
# 只取正频率部分
|
|
174
172
|
positive_idx = freqs >= 0
|
|
175
173
|
freqs = freqs[positive_idx]
|
|
176
174
|
h_measured = h_measured[positive_idx]
|
|
177
|
-
|
|
175
|
+
|
|
178
176
|
return freqs, np.abs(h_measured)
|
|
179
177
|
else:
|
|
180
178
|
# 直接返回频谱
|
|
181
179
|
spectrum = fft(audio)
|
|
182
|
-
freqs = fftfreq(len(audio), 1/self.sr)
|
|
183
|
-
|
|
180
|
+
freqs = fftfreq(len(audio), 1 / self.sr)
|
|
181
|
+
|
|
184
182
|
positive_idx = freqs >= 0
|
|
185
183
|
freqs = freqs[positive_idx]
|
|
186
184
|
spectrum = spectrum[positive_idx]
|
|
187
|
-
|
|
185
|
+
|
|
188
186
|
return freqs, np.abs(spectrum)
|
|
189
|
-
|
|
187
|
+
|
|
190
188
|
def loudness_range(self, audio: np.ndarray, gate_threshold: float = -70) -> dict:
|
|
191
189
|
"""
|
|
192
190
|
计算响度范围(基于EBU R128标准的简化版本)
|
|
193
|
-
|
|
191
|
+
|
|
194
192
|
Args:
|
|
195
193
|
audio: 音频信号
|
|
196
194
|
gate_threshold: 门限阈值(dB)
|
|
197
|
-
|
|
195
|
+
|
|
198
196
|
Returns:
|
|
199
197
|
响度统计信息字典
|
|
200
198
|
"""
|
|
201
199
|
# 分块计算短时响度
|
|
202
200
|
block_size = int(0.4 * self.sr) # 400ms块
|
|
203
201
|
hop_size = int(0.1 * self.sr) # 100ms跳跃
|
|
204
|
-
|
|
202
|
+
|
|
205
203
|
blocks = []
|
|
206
204
|
for i in range(0, len(audio) - block_size, hop_size):
|
|
207
205
|
block = audio[i:i + block_size]
|
|
@@ -211,31 +209,31 @@ class QualityAnalyzer:
|
|
|
211
209
|
loudness = 20 * np.log10(rms)
|
|
212
210
|
if loudness > gate_threshold:
|
|
213
211
|
blocks.append(loudness)
|
|
214
|
-
|
|
212
|
+
|
|
215
213
|
if len(blocks) == 0:
|
|
216
214
|
return {'integrated_loudness': -float('inf'), 'loudness_range': 0, 'max_loudness': -float('inf')}
|
|
217
|
-
|
|
215
|
+
|
|
218
216
|
blocks = np.array(blocks)
|
|
219
|
-
|
|
217
|
+
|
|
220
218
|
# 计算统计量
|
|
221
219
|
integrated_loudness = np.mean(blocks)
|
|
222
220
|
loudness_range = np.percentile(blocks, 95) - np.percentile(blocks, 10)
|
|
223
221
|
max_loudness = np.max(blocks)
|
|
224
|
-
|
|
222
|
+
|
|
225
223
|
return {
|
|
226
224
|
'integrated_loudness': integrated_loudness,
|
|
227
225
|
'loudness_range': loudness_range,
|
|
228
226
|
'max_loudness': max_loudness
|
|
229
227
|
}
|
|
230
|
-
|
|
228
|
+
|
|
231
229
|
def spectral_distortion(self, original: np.ndarray, processed: np.ndarray) -> float:
|
|
232
230
|
"""
|
|
233
231
|
计算谱失真度
|
|
234
|
-
|
|
232
|
+
|
|
235
233
|
Args:
|
|
236
234
|
original: 原始音频
|
|
237
235
|
processed: 处理后音频
|
|
238
|
-
|
|
236
|
+
|
|
239
237
|
Returns:
|
|
240
238
|
谱失真度(dB)
|
|
241
239
|
"""
|
|
@@ -243,48 +241,48 @@ class QualityAnalyzer:
|
|
|
243
241
|
min_len = min(len(original), len(processed))
|
|
244
242
|
original = original[:min_len]
|
|
245
243
|
processed = processed[:min_len]
|
|
246
|
-
|
|
244
|
+
|
|
247
245
|
# 计算频谱
|
|
248
246
|
orig_spectrum = np.abs(fft(original))
|
|
249
247
|
proc_spectrum = np.abs(fft(processed))
|
|
250
|
-
|
|
248
|
+
|
|
251
249
|
# 计算谱失真
|
|
252
250
|
mse = np.mean((orig_spectrum - proc_spectrum) ** 2)
|
|
253
251
|
orig_power = np.mean(orig_spectrum ** 2)
|
|
254
|
-
|
|
252
|
+
|
|
255
253
|
if orig_power == 0:
|
|
256
254
|
return float('inf')
|
|
257
|
-
|
|
255
|
+
|
|
258
256
|
distortion_db = 10 * np.log10(mse / orig_power)
|
|
259
257
|
return distortion_db
|
|
260
258
|
|
|
261
259
|
|
|
262
|
-
def comprehensive_quality_assessment(audio: np.ndarray, sr: int = 22050,
|
|
263
|
-
|
|
260
|
+
def comprehensive_quality_assessment(audio: np.ndarray, sr: int = 22050,
|
|
261
|
+
reference: Optional[np.ndarray] = None) -> dict:
|
|
264
262
|
"""
|
|
265
263
|
综合质量评估
|
|
266
|
-
|
|
264
|
+
|
|
267
265
|
Args:
|
|
268
266
|
audio: 待评估音频
|
|
269
267
|
sr: 采样率
|
|
270
268
|
reference: 参考音频(可选)
|
|
271
|
-
|
|
269
|
+
|
|
272
270
|
Returns:
|
|
273
271
|
质量评估结果字典
|
|
274
272
|
"""
|
|
275
273
|
analyzer = QualityAnalyzer(sr=sr)
|
|
276
|
-
|
|
274
|
+
|
|
277
275
|
results = {
|
|
278
276
|
'dynamic_range': analyzer.dynamic_range(audio),
|
|
279
277
|
'loudness_stats': analyzer.loudness_range(audio),
|
|
280
278
|
}
|
|
281
|
-
|
|
279
|
+
|
|
282
280
|
# 尝试计算THD
|
|
283
281
|
try:
|
|
284
282
|
results['thd'] = analyzer.total_harmonic_distortion(audio)
|
|
285
283
|
except:
|
|
286
284
|
results['thd'] = None
|
|
287
|
-
|
|
285
|
+
|
|
288
286
|
# 如果有参考音频, 计算比较指标
|
|
289
287
|
if reference is not None:
|
|
290
288
|
try:
|
|
@@ -293,7 +291,7 @@ def comprehensive_quality_assessment(audio: np.ndarray, sr: int = 22050,
|
|
|
293
291
|
except:
|
|
294
292
|
results['snr'] = None
|
|
295
293
|
results['spectral_distortion'] = None
|
|
296
|
-
|
|
294
|
+
|
|
297
295
|
# 频率响应
|
|
298
296
|
try:
|
|
299
297
|
freqs, response = analyzer.frequency_response(audio, reference)
|
|
@@ -303,18 +301,18 @@ def comprehensive_quality_assessment(audio: np.ndarray, sr: int = 22050,
|
|
|
303
301
|
}
|
|
304
302
|
except:
|
|
305
303
|
results['frequency_response'] = None
|
|
306
|
-
|
|
304
|
+
|
|
307
305
|
return results
|
|
308
306
|
|
|
309
307
|
|
|
310
308
|
def audio_health_check(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
311
309
|
"""
|
|
312
310
|
音频健康检查
|
|
313
|
-
|
|
311
|
+
|
|
314
312
|
Args:
|
|
315
313
|
audio: 音频信号
|
|
316
314
|
sr: 采样率
|
|
317
|
-
|
|
315
|
+
|
|
318
316
|
Returns:
|
|
319
317
|
健康检查结果
|
|
320
318
|
"""
|
|
@@ -323,36 +321,36 @@ def audio_health_check(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
|
323
321
|
'warnings': [],
|
|
324
322
|
'stats': {}
|
|
325
323
|
}
|
|
326
|
-
|
|
324
|
+
|
|
327
325
|
# 基础统计
|
|
328
326
|
max_amplitude = np.max(np.abs(audio))
|
|
329
327
|
min_amplitude = np.min(np.abs(audio))
|
|
330
328
|
mean_amplitude = np.mean(np.abs(audio))
|
|
331
|
-
|
|
329
|
+
|
|
332
330
|
health_report['stats'] = {
|
|
333
331
|
'max_amplitude': max_amplitude,
|
|
334
332
|
'min_amplitude': min_amplitude,
|
|
335
333
|
'mean_amplitude': mean_amplitude,
|
|
336
334
|
'duration': len(audio) / sr
|
|
337
335
|
}
|
|
338
|
-
|
|
336
|
+
|
|
339
337
|
# 检查削波
|
|
340
338
|
if max_amplitude >= 0.99:
|
|
341
339
|
health_report['issues'].append('Potential clipping detected')
|
|
342
|
-
|
|
340
|
+
|
|
343
341
|
# 检查过低音量
|
|
344
342
|
if max_amplitude < 0.01:
|
|
345
343
|
health_report['warnings'].append('Very low signal level')
|
|
346
|
-
|
|
344
|
+
|
|
347
345
|
# 检查静音
|
|
348
346
|
if mean_amplitude < 1e-6:
|
|
349
347
|
health_report['issues'].append('Signal appears to be silent')
|
|
350
|
-
|
|
348
|
+
|
|
351
349
|
# 检查DC偏移
|
|
352
350
|
dc_offset = np.mean(audio)
|
|
353
351
|
if abs(dc_offset) > 0.01:
|
|
354
352
|
health_report['warnings'].append(f'DC offset detected: {dc_offset:.4f}')
|
|
355
|
-
|
|
353
|
+
|
|
356
354
|
# 检查动态范围
|
|
357
355
|
analyzer = QualityAnalyzer(sr=sr)
|
|
358
356
|
dynamic_range = analyzer.dynamic_range(audio)
|
|
@@ -360,5 +358,5 @@ def audio_health_check(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
|
360
358
|
health_report['warnings'].append('Low dynamic range')
|
|
361
359
|
elif dynamic_range > 60:
|
|
362
360
|
health_report['warnings'].append('Very high dynamic range - check for noise')
|
|
363
|
-
|
|
364
|
-
return health_report
|
|
361
|
+
|
|
362
|
+
return health_report
|
|
@@ -1,32 +1,13 @@
|
|
|
1
1
|
'''
|
|
2
2
|
Author: 凌逆战 | Never
|
|
3
3
|
Date: 2025-03-26 22:13:22
|
|
4
|
-
Description:
|
|
4
|
+
Description: 统计音频语音段rms值分布
|
|
5
5
|
'''
|
|
6
|
-
# -*- coding:utf-8 -*-
|
|
7
|
-
# Author:凌逆战 | Never
|
|
8
|
-
# Date: 2025/3/2
|
|
9
|
-
"""
|
|
10
|
-
统计音频语音段rms值分布
|
|
11
|
-
"""
|
|
12
|
-
import sys
|
|
13
|
-
sys.path.append("../../../")
|
|
14
|
-
import torch
|
|
15
6
|
import soundfile as sf
|
|
7
|
+
from .temporal_features import rms_amplitude
|
|
16
8
|
from neverlib.utils import get_path_list
|
|
17
|
-
from neverlib.filter import HPFilter
|
|
18
|
-
from neverlib.audio_aug import volume_norm
|
|
19
|
-
from neverlib.dataAnalyze.utils import rms_amplitude
|
|
20
|
-
from joblib import Parallel, delayed
|
|
21
|
-
import matplotlib.pyplot as plt
|
|
22
|
-
import numpy as np
|
|
23
|
-
import librosa
|
|
24
|
-
import os
|
|
25
|
-
from utils.train_utils import from_path_get_vadpoint
|
|
26
9
|
|
|
27
10
|
|
|
28
|
-
|
|
29
|
-
|
|
30
11
|
def get_rms_vad(wav_path):
|
|
31
12
|
wav, wav_sr = sf.read(wav_path, always_2d=True) # (xxx,ch)
|
|
32
13
|
assert wav_sr == sr, f"期望采样率为{sr}, 但是为{wav_sr}, 文件名: {wav_path}"
|
|
@@ -39,24 +20,24 @@ def get_rms_vad(wav_path):
|
|
|
39
20
|
return rms
|
|
40
21
|
|
|
41
22
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
plt.
|
|
57
|
-
plt.
|
|
58
|
-
plt.
|
|
59
|
-
plt.
|
|
60
|
-
plt.
|
|
61
|
-
plt.
|
|
62
|
-
|
|
23
|
+
if __name__ == "__main__":
|
|
24
|
+
sr = 16000
|
|
25
|
+
wav_dir_list = [
|
|
26
|
+
"/data/never/Dataset/kws_data/Command_Word/Crowdsourcing/en_kws2/train/RealPerson",
|
|
27
|
+
"/data/never/Dataset/kws_data/Command_Word/Crowdsourcing/en_kws2/val/RealPerson",
|
|
28
|
+
"/data/never/Dataset/kws_data/Command_Word/Crowdsourcing/en_kws2/test/RealPerson",
|
|
29
|
+
]
|
|
30
|
+
wav_path_list = []
|
|
31
|
+
for wav_dir in wav_dir_list:
|
|
32
|
+
wav_path_list.extend(get_path_list(wav_dir, end="*.wav"))
|
|
33
|
+
|
|
34
|
+
rms_list = Parallel(n_jobs=64)(delayed(get_rms_vad)(wav_path) for wav_path in wav_path_list)
|
|
35
|
+
|
|
36
|
+
# 绘制时长分布直方图
|
|
37
|
+
plt.hist(rms_list, bins=100, edgecolor='black')
|
|
38
|
+
plt.title("RMS Distribution")
|
|
39
|
+
plt.xlabel("RMS (dB)")
|
|
40
|
+
plt.ylabel("number")
|
|
41
|
+
plt.grid(True)
|
|
42
|
+
plt.tight_layout()
|
|
43
|
+
plt.savefig("./png_dist/rms_distribution.png")
|