neverlib 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.claude/settings.local.json +9 -0
- neverlib/Docs/audio_aug/test_volume.ipynb +416 -0
- neverlib/Docs/audio_aug_test/test_volume.ipynb +289 -0
- neverlib/Docs/filter/biquad.ipynb +129 -0
- neverlib/Docs/filter/filter_family.ipynb +450 -0
- neverlib/Docs/filter/highpass.ipynb +139 -0
- neverlib/Docs/filter/scipy_filter_family.ipynb +110 -0
- neverlib/Docs/vad/VAD_Energy.ipynb +167 -0
- neverlib/Docs/vad/VAD_Silero.ipynb +325 -0
- neverlib/Docs/vad/VAD_WebRTC.ipynb +189 -0
- neverlib/Docs/vad/VAD_funasr.ipynb +192 -0
- neverlib/Docs/vad/VAD_rvADfast.ipynb +162 -0
- neverlib/Docs/vad/VAD_statistics.ipynb +532 -0
- neverlib/Docs/vad/VAD_tenVAD.ipynb +292 -0
- neverlib/Docs/vad/VAD_vadlib.ipynb +168 -0
- neverlib/Docs/vad/VAD_whisper.ipynb +404 -0
- neverlib/QA/gen_init.py +117 -0
- neverlib/QA/get_fun.py +19 -0
- neverlib/__init__.py +21 -4
- neverlib/audio_aug/HarmonicDistortion.py +19 -13
- neverlib/audio_aug/__init__.py +30 -12
- neverlib/audio_aug/audio_aug.py +19 -14
- neverlib/audio_aug/clip_aug.py +15 -18
- neverlib/audio_aug/coder_aug.py +44 -24
- neverlib/audio_aug/coder_aug2.py +54 -37
- neverlib/audio_aug/loss_packet_aug.py +7 -7
- neverlib/audio_aug/quant_aug.py +19 -17
- neverlib/data/000_short_enhance.wav +0 -0
- neverlib/data/3956_speech.wav +0 -0
- neverlib/data/3956_sweep.wav +0 -0
- neverlib/data/vad_example.wav +0 -0
- neverlib/data/white.wav +0 -0
- neverlib/data/white_EQ.wav +0 -0
- neverlib/data/white_matched.wav +0 -0
- neverlib/data_analyze/__init__.py +25 -20
- neverlib/data_analyze/dataset_analyzer.py +109 -114
- neverlib/data_analyze/quality_metrics.py +87 -89
- neverlib/data_analyze/rms_distrubution.py +23 -42
- neverlib/data_analyze/spectral_analysis.py +43 -46
- neverlib/data_analyze/statistics.py +76 -76
- neverlib/data_analyze/temporal_features.py +15 -6
- neverlib/data_analyze/visualization.py +208 -144
- neverlib/filter/__init__.py +17 -20
- neverlib/filter/auto_eq/__init__.py +18 -35
- neverlib/filter/auto_eq/de_eq.py +0 -2
- neverlib/filter/common.py +24 -5
- neverlib/metrics/DNSMOS/bak_ovr.onnx +0 -0
- neverlib/metrics/DNSMOS/model_v8.onnx +0 -0
- neverlib/metrics/DNSMOS/sig.onnx +0 -0
- neverlib/metrics/DNSMOS/sig_bak_ovr.onnx +0 -0
- neverlib/metrics/__init__.py +23 -0
- neverlib/metrics/dnsmos.py +4 -15
- neverlib/metrics/pDNSMOS/sig_bak_ovr.onnx +0 -0
- neverlib/metrics/pesq_c/PESQ +0 -0
- neverlib/metrics/pesq_c/dsp.c +553 -0
- neverlib/metrics/pesq_c/dsp.h +138 -0
- neverlib/metrics/pesq_c/pesq.h +294 -0
- neverlib/metrics/pesq_c/pesqdsp.c +1047 -0
- neverlib/metrics/pesq_c/pesqio.c +392 -0
- neverlib/metrics/pesq_c/pesqmain.c +610 -0
- neverlib/metrics/pesq_c/pesqmod.c +1417 -0
- neverlib/metrics/pesq_c/pesqpar.h +297 -0
- neverlib/metrics/snr.py +5 -1
- neverlib/metrics/spec.py +31 -21
- neverlib/metrics/test_pesq.py +0 -4
- neverlib/tests/test_imports.py +17 -0
- neverlib/utils/__init__.py +26 -15
- neverlib/utils/audio_split.py +5 -1
- neverlib/utils/checkGPU.py +17 -9
- neverlib/utils/lazy_expose.py +29 -0
- neverlib/utils/utils.py +40 -12
- neverlib/vad/__init__.py +33 -25
- neverlib/vad/class_get_speech.py +1 -1
- neverlib/vad/class_vad.py +3 -3
- neverlib/vad/img.png +0 -0
- {neverlib-0.2.6.dist-info → neverlib-0.2.7.dist-info}/METADATA +1 -1
- {neverlib-0.2.6.dist-info → neverlib-0.2.7.dist-info}/RECORD +80 -37
- {neverlib-0.2.6.dist-info → neverlib-0.2.7.dist-info}/WHEEL +0 -0
- {neverlib-0.2.6.dist-info → neverlib-0.2.7.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.6.dist-info → neverlib-0.2.7.dist-info}/top_level.txt +0 -0
|
@@ -4,22 +4,19 @@ Spectral Analysis Module
|
|
|
4
4
|
|
|
5
5
|
提供音频频域特征提取和分析功能
|
|
6
6
|
"""
|
|
7
|
-
|
|
8
|
-
import numpy as np
|
|
9
7
|
import librosa
|
|
10
|
-
import
|
|
8
|
+
import numpy as np
|
|
11
9
|
from scipy.fft import fft, fftfreq
|
|
12
10
|
from typing import Tuple, Optional, Union
|
|
13
|
-
import warnings
|
|
14
11
|
|
|
15
12
|
|
|
16
13
|
class SpectralAnalyzer:
|
|
17
14
|
"""频谱分析器类"""
|
|
18
|
-
|
|
15
|
+
|
|
19
16
|
def __init__(self, sr: int = 22050, n_fft: int = 2048, hop_length: int = 512):
|
|
20
17
|
"""
|
|
21
18
|
初始化频谱分析器
|
|
22
|
-
|
|
19
|
+
|
|
23
20
|
Args:
|
|
24
21
|
sr: 采样率
|
|
25
22
|
n_fft: FFT窗口大小
|
|
@@ -28,132 +25,132 @@ class SpectralAnalyzer:
|
|
|
28
25
|
self.sr = sr
|
|
29
26
|
self.n_fft = n_fft
|
|
30
27
|
self.hop_length = hop_length
|
|
31
|
-
|
|
28
|
+
|
|
32
29
|
def compute_stft(self, audio: np.ndarray) -> np.ndarray:
|
|
33
30
|
"""
|
|
34
31
|
计算短时傅里叶变换
|
|
35
|
-
|
|
32
|
+
|
|
36
33
|
Args:
|
|
37
34
|
audio: 音频信号
|
|
38
|
-
|
|
35
|
+
|
|
39
36
|
Returns:
|
|
40
37
|
STFT结果
|
|
41
38
|
"""
|
|
42
39
|
return librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)
|
|
43
|
-
|
|
40
|
+
|
|
44
41
|
def compute_magnitude_spectrum(self, audio: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
|
45
42
|
"""
|
|
46
43
|
计算幅度谱
|
|
47
|
-
|
|
44
|
+
|
|
48
45
|
Args:
|
|
49
46
|
audio: 音频信号
|
|
50
|
-
|
|
47
|
+
|
|
51
48
|
Returns:
|
|
52
49
|
频率轴, 幅度谱
|
|
53
50
|
"""
|
|
54
51
|
spectrum = fft(audio)
|
|
55
52
|
magnitude = np.abs(spectrum)
|
|
56
|
-
freqs = fftfreq(len(audio), 1/self.sr)
|
|
57
|
-
|
|
53
|
+
freqs = fftfreq(len(audio), 1 / self.sr)
|
|
54
|
+
|
|
58
55
|
# 只返回正频率部分
|
|
59
56
|
positive_freq_idx = freqs >= 0
|
|
60
57
|
return freqs[positive_freq_idx], magnitude[positive_freq_idx]
|
|
61
|
-
|
|
58
|
+
|
|
62
59
|
def spectral_centroid(self, audio: np.ndarray) -> np.ndarray:
|
|
63
60
|
"""
|
|
64
61
|
计算谱重心
|
|
65
|
-
|
|
62
|
+
|
|
66
63
|
Args:
|
|
67
64
|
audio: 音频信号
|
|
68
|
-
|
|
65
|
+
|
|
69
66
|
Returns:
|
|
70
67
|
谱重心数组
|
|
71
68
|
"""
|
|
72
69
|
return librosa.feature.spectral_centroid(
|
|
73
70
|
y=audio, sr=self.sr, hop_length=self.hop_length
|
|
74
71
|
)[0]
|
|
75
|
-
|
|
72
|
+
|
|
76
73
|
def spectral_rolloff(self, audio: np.ndarray, roll_percent: float = 0.85) -> np.ndarray:
|
|
77
74
|
"""
|
|
78
75
|
计算谱滚降
|
|
79
|
-
|
|
76
|
+
|
|
80
77
|
Args:
|
|
81
78
|
audio: 音频信号
|
|
82
79
|
roll_percent: 滚降百分比
|
|
83
|
-
|
|
80
|
+
|
|
84
81
|
Returns:
|
|
85
82
|
谱滚降数组
|
|
86
83
|
"""
|
|
87
84
|
return librosa.feature.spectral_rolloff(
|
|
88
85
|
y=audio, sr=self.sr, hop_length=self.hop_length, roll_percent=roll_percent
|
|
89
86
|
)[0]
|
|
90
|
-
|
|
87
|
+
|
|
91
88
|
def spectral_flatness(self, audio: np.ndarray) -> np.ndarray:
|
|
92
89
|
"""
|
|
93
90
|
计算谱平坦度
|
|
94
|
-
|
|
91
|
+
|
|
95
92
|
Args:
|
|
96
93
|
audio: 音频信号
|
|
97
|
-
|
|
94
|
+
|
|
98
95
|
Returns:
|
|
99
96
|
谱平坦度数组
|
|
100
97
|
"""
|
|
101
98
|
return librosa.feature.spectral_flatness(
|
|
102
99
|
y=audio, hop_length=self.hop_length
|
|
103
100
|
)[0]
|
|
104
|
-
|
|
101
|
+
|
|
105
102
|
def spectral_contrast(self, audio: np.ndarray, n_bands: int = 6) -> np.ndarray:
|
|
106
103
|
"""
|
|
107
104
|
计算谱对比度
|
|
108
|
-
|
|
105
|
+
|
|
109
106
|
Args:
|
|
110
107
|
audio: 音频信号
|
|
111
108
|
n_bands: 频段数量
|
|
112
|
-
|
|
109
|
+
|
|
113
110
|
Returns:
|
|
114
111
|
谱对比度矩阵
|
|
115
112
|
"""
|
|
116
113
|
return librosa.feature.spectral_contrast(
|
|
117
114
|
y=audio, sr=self.sr, hop_length=self.hop_length, n_bands=n_bands
|
|
118
115
|
)
|
|
119
|
-
|
|
116
|
+
|
|
120
117
|
def mfcc_features(self, audio: np.ndarray, n_mfcc: int = 13) -> np.ndarray:
|
|
121
118
|
"""
|
|
122
119
|
提取MFCC特征
|
|
123
|
-
|
|
120
|
+
|
|
124
121
|
Args:
|
|
125
122
|
audio: 音频信号
|
|
126
123
|
n_mfcc: MFCC系数数量
|
|
127
|
-
|
|
124
|
+
|
|
128
125
|
Returns:
|
|
129
126
|
MFCC特征矩阵
|
|
130
127
|
"""
|
|
131
128
|
return librosa.feature.mfcc(
|
|
132
129
|
y=audio, sr=self.sr, n_mfcc=n_mfcc, hop_length=self.hop_length
|
|
133
130
|
)
|
|
134
|
-
|
|
131
|
+
|
|
135
132
|
def mel_spectrogram(self, audio: np.ndarray, n_mels: int = 128) -> np.ndarray:
|
|
136
133
|
"""
|
|
137
134
|
计算梅尔频谱图
|
|
138
|
-
|
|
135
|
+
|
|
139
136
|
Args:
|
|
140
137
|
audio: 音频信号
|
|
141
138
|
n_mels: 梅尔滤波器组数量
|
|
142
|
-
|
|
139
|
+
|
|
143
140
|
Returns:
|
|
144
141
|
梅尔频谱图
|
|
145
142
|
"""
|
|
146
143
|
return librosa.feature.melspectrogram(
|
|
147
144
|
y=audio, sr=self.sr, n_mels=n_mels, hop_length=self.hop_length
|
|
148
145
|
)
|
|
149
|
-
|
|
146
|
+
|
|
150
147
|
def chroma_features(self, audio: np.ndarray) -> np.ndarray:
|
|
151
148
|
"""
|
|
152
149
|
提取色度特征
|
|
153
|
-
|
|
150
|
+
|
|
154
151
|
Args:
|
|
155
152
|
audio: 音频信号
|
|
156
|
-
|
|
153
|
+
|
|
157
154
|
Returns:
|
|
158
155
|
色度特征矩阵
|
|
159
156
|
"""
|
|
@@ -165,16 +162,16 @@ class SpectralAnalyzer:
|
|
|
165
162
|
def compute_spectral_features(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
166
163
|
"""
|
|
167
164
|
计算完整的频域特征集合
|
|
168
|
-
|
|
165
|
+
|
|
169
166
|
Args:
|
|
170
167
|
audio: 音频信号
|
|
171
168
|
sr: 采样率
|
|
172
|
-
|
|
169
|
+
|
|
173
170
|
Returns:
|
|
174
171
|
包含各种频域特征的字典
|
|
175
172
|
"""
|
|
176
173
|
analyzer = SpectralAnalyzer(sr=sr)
|
|
177
|
-
|
|
174
|
+
|
|
178
175
|
features = {
|
|
179
176
|
'spectral_centroid': analyzer.spectral_centroid(audio),
|
|
180
177
|
'spectral_rolloff': analyzer.spectral_rolloff(audio),
|
|
@@ -184,27 +181,27 @@ def compute_spectral_features(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
|
184
181
|
'mel_spectrogram': analyzer.mel_spectrogram(audio),
|
|
185
182
|
'chroma': analyzer.chroma_features(audio)
|
|
186
183
|
}
|
|
187
|
-
|
|
184
|
+
|
|
188
185
|
return features
|
|
189
186
|
|
|
190
187
|
|
|
191
188
|
def frequency_domain_stats(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
192
189
|
"""
|
|
193
190
|
计算频域统计信息
|
|
194
|
-
|
|
191
|
+
|
|
195
192
|
Args:
|
|
196
193
|
audio: 音频信号
|
|
197
194
|
sr: 采样率
|
|
198
|
-
|
|
195
|
+
|
|
199
196
|
Returns:
|
|
200
197
|
频域统计信息字典
|
|
201
198
|
"""
|
|
202
199
|
analyzer = SpectralAnalyzer(sr=sr)
|
|
203
200
|
freqs, magnitude = analyzer.compute_magnitude_spectrum(audio)
|
|
204
|
-
|
|
201
|
+
|
|
205
202
|
# 计算功率谱密度
|
|
206
203
|
power = magnitude ** 2
|
|
207
|
-
|
|
204
|
+
|
|
208
205
|
# 计算统计量
|
|
209
206
|
stats = {
|
|
210
207
|
'mean_frequency': np.average(freqs, weights=power),
|
|
@@ -212,7 +209,7 @@ def frequency_domain_stats(audio: np.ndarray, sr: int = 22050) -> dict:
|
|
|
212
209
|
'peak_frequency': freqs[np.argmax(magnitude)],
|
|
213
210
|
'bandwidth': freqs[np.where(power > 0.5 * np.max(power))][-1] - freqs[np.where(power > 0.5 * np.max(power))][0],
|
|
214
211
|
'spectral_energy': np.sum(power),
|
|
215
|
-
'spectral_entropy': -np.sum((power/np.sum(power)) * np.log2(power/np.sum(power) + 1e-10))
|
|
212
|
+
'spectral_entropy': -np.sum((power / np.sum(power)) * np.log2(power / np.sum(power) + 1e-10))
|
|
216
213
|
}
|
|
217
|
-
|
|
218
|
-
return stats
|
|
214
|
+
|
|
215
|
+
return stats
|
|
@@ -4,25 +4,20 @@ Statistics Analysis Module
|
|
|
4
4
|
|
|
5
5
|
提供音频数据集统计分析功能
|
|
6
6
|
"""
|
|
7
|
-
|
|
7
|
+
import json
|
|
8
8
|
import numpy as np
|
|
9
|
-
import librosa
|
|
10
|
-
import os
|
|
11
9
|
from pathlib import Path
|
|
12
|
-
from typing import List, Dict, Tuple, Optional
|
|
13
|
-
|
|
14
|
-
from collections import defaultdict
|
|
15
|
-
import json
|
|
16
|
-
from .utils import rms_amplitude, dB
|
|
10
|
+
from typing import List, Dict, Tuple, Optional
|
|
11
|
+
from .temporal_features import rms_amplitude, dB
|
|
17
12
|
|
|
18
13
|
|
|
19
14
|
class AudioStatistics:
|
|
20
15
|
"""音频统计分析类"""
|
|
21
|
-
|
|
16
|
+
|
|
22
17
|
def __init__(self, sr: int = 22050):
|
|
23
18
|
"""
|
|
24
19
|
初始化统计分析器
|
|
25
|
-
|
|
20
|
+
|
|
26
21
|
Args:
|
|
27
22
|
sr: 采样率
|
|
28
23
|
"""
|
|
@@ -30,53 +25,58 @@ class AudioStatistics:
|
|
|
30
25
|
self.audio_data = []
|
|
31
26
|
self.file_paths = []
|
|
32
27
|
self.statistics = {}
|
|
33
|
-
|
|
28
|
+
|
|
34
29
|
def add_audio_file(self, file_path: str, audio_data: Optional[np.ndarray] = None):
|
|
35
30
|
"""
|
|
36
31
|
添加音频文件到分析列表
|
|
37
|
-
|
|
32
|
+
|
|
38
33
|
Args:
|
|
39
34
|
file_path: 音频文件路径
|
|
40
35
|
audio_data: 音频数据(如果不提供则从文件加载)
|
|
41
36
|
"""
|
|
37
|
+
try:
|
|
38
|
+
import librosa
|
|
39
|
+
except Exception as e:
|
|
40
|
+
raise ImportError("需要安装 librosa 才能使用 add_audio_file: pip install librosa") from e
|
|
41
|
+
|
|
42
42
|
if audio_data is None:
|
|
43
43
|
try:
|
|
44
44
|
audio_data, _ = librosa.load(file_path, sr=self.sr)
|
|
45
45
|
except Exception as e:
|
|
46
46
|
print(f"Error loading {file_path}: {e}")
|
|
47
47
|
return
|
|
48
|
-
|
|
48
|
+
|
|
49
49
|
self.audio_data.append(audio_data)
|
|
50
50
|
self.file_paths.append(file_path)
|
|
51
|
-
|
|
51
|
+
|
|
52
52
|
def add_audio_directory(self, directory: str, extensions: List[str] = None):
|
|
53
53
|
"""
|
|
54
54
|
批量添加目录中的音频文件
|
|
55
|
-
|
|
55
|
+
|
|
56
56
|
Args:
|
|
57
57
|
directory: 音频文件目录
|
|
58
58
|
extensions: 支持的文件扩展名
|
|
59
59
|
"""
|
|
60
60
|
if extensions is None:
|
|
61
61
|
extensions = ['.wav', '.mp3', '.flac', '.m4a', '.aac']
|
|
62
|
-
|
|
62
|
+
|
|
63
63
|
directory = Path(directory)
|
|
64
64
|
for ext in extensions:
|
|
65
65
|
for file_path in directory.glob(f'*{ext}'):
|
|
66
66
|
self.add_audio_file(str(file_path))
|
|
67
|
-
|
|
67
|
+
|
|
68
68
|
def compute_duration_statistics(self) -> Dict:
|
|
69
69
|
"""
|
|
70
70
|
计算音频时长统计
|
|
71
|
-
|
|
71
|
+
|
|
72
72
|
Returns:
|
|
73
73
|
时长统计信息
|
|
74
74
|
"""
|
|
75
75
|
durations = [len(audio) / self.sr for audio in self.audio_data]
|
|
76
|
-
|
|
76
|
+
|
|
77
77
|
if not durations:
|
|
78
78
|
return {}
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
stats = {
|
|
81
81
|
'count': len(durations),
|
|
82
82
|
'total_duration': sum(durations),
|
|
@@ -92,30 +92,30 @@ class AudioStatistics:
|
|
|
92
92
|
'95th': np.percentile(durations, 95)
|
|
93
93
|
}
|
|
94
94
|
}
|
|
95
|
-
|
|
95
|
+
|
|
96
96
|
return stats
|
|
97
|
-
|
|
97
|
+
|
|
98
98
|
def compute_amplitude_statistics(self) -> Dict:
|
|
99
99
|
"""
|
|
100
100
|
计算幅度统计
|
|
101
|
-
|
|
101
|
+
|
|
102
102
|
Returns:
|
|
103
103
|
幅度统计信息
|
|
104
104
|
"""
|
|
105
105
|
all_amplitudes = []
|
|
106
106
|
max_amplitudes = []
|
|
107
107
|
rms_values = []
|
|
108
|
-
|
|
108
|
+
|
|
109
109
|
for audio in self.audio_data:
|
|
110
110
|
all_amplitudes.extend(np.abs(audio).tolist())
|
|
111
111
|
max_amplitudes.append(np.max(np.abs(audio)))
|
|
112
112
|
rms_values.append(rms_amplitude(audio))
|
|
113
|
-
|
|
113
|
+
|
|
114
114
|
if not all_amplitudes:
|
|
115
115
|
return {}
|
|
116
|
-
|
|
116
|
+
|
|
117
117
|
all_amplitudes = np.array(all_amplitudes)
|
|
118
|
-
|
|
118
|
+
|
|
119
119
|
stats = {
|
|
120
120
|
'overall': {
|
|
121
121
|
'mean': np.mean(all_amplitudes),
|
|
@@ -144,33 +144,33 @@ class AudioStatistics:
|
|
|
144
144
|
'std_db': np.std([dB(rms) for rms in rms_values])
|
|
145
145
|
}
|
|
146
146
|
}
|
|
147
|
-
|
|
147
|
+
|
|
148
148
|
return stats
|
|
149
|
-
|
|
149
|
+
|
|
150
150
|
def compute_frequency_statistics(self) -> Dict:
|
|
151
151
|
"""
|
|
152
152
|
计算频域统计
|
|
153
|
-
|
|
153
|
+
|
|
154
154
|
Returns:
|
|
155
155
|
频域统计信息
|
|
156
156
|
"""
|
|
157
157
|
spectral_centroids = []
|
|
158
158
|
spectral_bandwidths = []
|
|
159
159
|
spectral_rolloffs = []
|
|
160
|
-
|
|
160
|
+
|
|
161
161
|
for audio in self.audio_data:
|
|
162
162
|
# 计算频谱特征
|
|
163
163
|
centroid = librosa.feature.spectral_centroid(y=audio, sr=self.sr)[0]
|
|
164
164
|
bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=self.sr)[0]
|
|
165
165
|
rolloff = librosa.feature.spectral_rolloff(y=audio, sr=self.sr)[0]
|
|
166
|
-
|
|
166
|
+
|
|
167
167
|
spectral_centroids.extend(centroid.tolist())
|
|
168
168
|
spectral_bandwidths.extend(bandwidth.tolist())
|
|
169
169
|
spectral_rolloffs.extend(rolloff.tolist())
|
|
170
|
-
|
|
170
|
+
|
|
171
171
|
if not spectral_centroids:
|
|
172
172
|
return {}
|
|
173
|
-
|
|
173
|
+
|
|
174
174
|
stats = {
|
|
175
175
|
'spectral_centroid': {
|
|
176
176
|
'mean': np.mean(spectral_centroids),
|
|
@@ -191,17 +191,17 @@ class AudioStatistics:
|
|
|
191
191
|
'max': np.max(spectral_rolloffs)
|
|
192
192
|
}
|
|
193
193
|
}
|
|
194
|
-
|
|
194
|
+
|
|
195
195
|
return stats
|
|
196
|
-
|
|
196
|
+
|
|
197
197
|
def detect_outliers(self, feature: str = 'duration', threshold: float = 2.0) -> List[Tuple[str, float]]:
|
|
198
198
|
"""
|
|
199
199
|
检测异常值
|
|
200
|
-
|
|
200
|
+
|
|
201
201
|
Args:
|
|
202
202
|
feature: 要检测的特征 ('duration', 'max_amplitude', 'rms')
|
|
203
203
|
threshold: Z-score阈值
|
|
204
|
-
|
|
204
|
+
|
|
205
205
|
Returns:
|
|
206
206
|
异常文件列表 [(文件路径, 特征值)]
|
|
207
207
|
"""
|
|
@@ -213,23 +213,23 @@ class AudioStatistics:
|
|
|
213
213
|
values = [rms_amplitude(audio) for audio in self.audio_data]
|
|
214
214
|
else:
|
|
215
215
|
raise ValueError(f"Unknown feature: {feature}")
|
|
216
|
-
|
|
216
|
+
|
|
217
217
|
values = np.array(values)
|
|
218
218
|
mean_val = np.mean(values)
|
|
219
219
|
std_val = np.std(values)
|
|
220
|
-
|
|
220
|
+
|
|
221
221
|
outliers = []
|
|
222
222
|
for i, (path, val) in enumerate(zip(self.file_paths, values)):
|
|
223
223
|
z_score = abs(val - mean_val) / (std_val + 1e-10)
|
|
224
224
|
if z_score > threshold:
|
|
225
225
|
outliers.append((path, val))
|
|
226
|
-
|
|
226
|
+
|
|
227
227
|
return outliers
|
|
228
|
-
|
|
228
|
+
|
|
229
229
|
def generate_distribution_analysis(self) -> Dict:
|
|
230
230
|
"""
|
|
231
231
|
生成分布分析
|
|
232
|
-
|
|
232
|
+
|
|
233
233
|
Returns:
|
|
234
234
|
分布分析结果
|
|
235
235
|
"""
|
|
@@ -238,34 +238,34 @@ class AudioStatistics:
|
|
|
238
238
|
'amplitude_distribution': self._analyze_distribution([np.max(np.abs(audio)) for audio in self.audio_data]),
|
|
239
239
|
'rms_distribution': self._analyze_distribution([rms_amplitude(audio) for audio in self.audio_data])
|
|
240
240
|
}
|
|
241
|
-
|
|
241
|
+
|
|
242
242
|
return analysis
|
|
243
|
-
|
|
243
|
+
|
|
244
244
|
def _analyze_distribution(self, values: List[float]) -> Dict:
|
|
245
245
|
"""
|
|
246
246
|
分析数值分布
|
|
247
|
-
|
|
247
|
+
|
|
248
248
|
Args:
|
|
249
249
|
values: 数值列表
|
|
250
|
-
|
|
250
|
+
|
|
251
251
|
Returns:
|
|
252
252
|
分布分析结果
|
|
253
253
|
"""
|
|
254
254
|
if not values:
|
|
255
255
|
return {}
|
|
256
|
-
|
|
256
|
+
|
|
257
257
|
values = np.array(values)
|
|
258
|
-
|
|
258
|
+
|
|
259
259
|
# 计算偏度和峰度
|
|
260
260
|
mean_val = np.mean(values)
|
|
261
261
|
std_val = np.std(values)
|
|
262
|
-
|
|
262
|
+
|
|
263
263
|
# 偏度 (skewness)
|
|
264
264
|
skewness = np.mean(((values - mean_val) / (std_val + 1e-10)) ** 3)
|
|
265
|
-
|
|
265
|
+
|
|
266
266
|
# 峰度 (kurtosis)
|
|
267
267
|
kurtosis = np.mean(((values - mean_val) / (std_val + 1e-10)) ** 4) - 3
|
|
268
|
-
|
|
268
|
+
|
|
269
269
|
return {
|
|
270
270
|
'mean': mean_val,
|
|
271
271
|
'std': std_val,
|
|
@@ -273,15 +273,15 @@ class AudioStatistics:
|
|
|
273
273
|
'kurtosis': kurtosis,
|
|
274
274
|
'distribution_type': self._classify_distribution(skewness, kurtosis)
|
|
275
275
|
}
|
|
276
|
-
|
|
276
|
+
|
|
277
277
|
def _classify_distribution(self, skewness: float, kurtosis: float) -> str:
|
|
278
278
|
"""
|
|
279
279
|
分类分布类型
|
|
280
|
-
|
|
280
|
+
|
|
281
281
|
Args:
|
|
282
282
|
skewness: 偏度
|
|
283
283
|
kurtosis: 峰度
|
|
284
|
-
|
|
284
|
+
|
|
285
285
|
Returns:
|
|
286
286
|
分布类型描述
|
|
287
287
|
"""
|
|
@@ -297,11 +297,11 @@ class AudioStatistics:
|
|
|
297
297
|
return "light_tailed"
|
|
298
298
|
else:
|
|
299
299
|
return "unknown"
|
|
300
|
-
|
|
300
|
+
|
|
301
301
|
def compute_all_statistics(self) -> Dict:
|
|
302
302
|
"""
|
|
303
303
|
计算所有统计信息
|
|
304
|
-
|
|
304
|
+
|
|
305
305
|
Returns:
|
|
306
306
|
完整统计报告
|
|
307
307
|
"""
|
|
@@ -318,13 +318,13 @@ class AudioStatistics:
|
|
|
318
318
|
'rms': self.detect_outliers('rms')
|
|
319
319
|
}
|
|
320
320
|
}
|
|
321
|
-
|
|
321
|
+
|
|
322
322
|
return self.statistics
|
|
323
|
-
|
|
323
|
+
|
|
324
324
|
def export_statistics(self, output_path: str):
|
|
325
325
|
"""
|
|
326
326
|
导出统计结果到JSON文件
|
|
327
|
-
|
|
327
|
+
|
|
328
328
|
Args:
|
|
329
329
|
output_path: 输出文件路径
|
|
330
330
|
"""
|
|
@@ -342,9 +342,9 @@ class AudioStatistics:
|
|
|
342
342
|
return [convert_numpy(item) for item in obj]
|
|
343
343
|
else:
|
|
344
344
|
return obj
|
|
345
|
-
|
|
345
|
+
|
|
346
346
|
stats_json = convert_numpy(self.statistics)
|
|
347
|
-
|
|
347
|
+
|
|
348
348
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
349
349
|
json.dump(stats_json, f, indent=2, ensure_ascii=False)
|
|
350
350
|
|
|
@@ -352,47 +352,47 @@ class AudioStatistics:
|
|
|
352
352
|
def quick_audio_stats(file_paths: List[str], sr: int = 22050) -> Dict:
|
|
353
353
|
"""
|
|
354
354
|
快速音频统计分析
|
|
355
|
-
|
|
355
|
+
|
|
356
356
|
Args:
|
|
357
357
|
file_paths: 音频文件路径列表
|
|
358
358
|
sr: 采样率
|
|
359
|
-
|
|
359
|
+
|
|
360
360
|
Returns:
|
|
361
361
|
统计结果
|
|
362
362
|
"""
|
|
363
363
|
analyzer = AudioStatistics(sr=sr)
|
|
364
|
-
|
|
364
|
+
|
|
365
365
|
for file_path in file_paths:
|
|
366
366
|
analyzer.add_audio_file(file_path)
|
|
367
|
-
|
|
367
|
+
|
|
368
368
|
return analyzer.compute_all_statistics()
|
|
369
369
|
|
|
370
370
|
|
|
371
|
-
def compare_datasets(dataset1_paths: List[str], dataset2_paths: List[str],
|
|
372
|
-
|
|
371
|
+
def compare_datasets(dataset1_paths: List[str], dataset2_paths: List[str],
|
|
372
|
+
sr: int = 22050) -> Dict:
|
|
373
373
|
"""
|
|
374
374
|
比较两个数据集
|
|
375
|
-
|
|
375
|
+
|
|
376
376
|
Args:
|
|
377
377
|
dataset1_paths: 数据集1文件路径
|
|
378
378
|
dataset2_paths: 数据集2文件路径
|
|
379
379
|
sr: 采样率
|
|
380
|
-
|
|
380
|
+
|
|
381
381
|
Returns:
|
|
382
382
|
比较结果
|
|
383
383
|
"""
|
|
384
384
|
analyzer1 = AudioStatistics(sr=sr)
|
|
385
385
|
analyzer2 = AudioStatistics(sr=sr)
|
|
386
|
-
|
|
386
|
+
|
|
387
387
|
for path in dataset1_paths:
|
|
388
388
|
analyzer1.add_audio_file(path)
|
|
389
|
-
|
|
389
|
+
|
|
390
390
|
for path in dataset2_paths:
|
|
391
391
|
analyzer2.add_audio_file(path)
|
|
392
|
-
|
|
392
|
+
|
|
393
393
|
stats1 = analyzer1.compute_all_statistics()
|
|
394
394
|
stats2 = analyzer2.compute_all_statistics()
|
|
395
|
-
|
|
395
|
+
|
|
396
396
|
comparison = {
|
|
397
397
|
'dataset1': stats1,
|
|
398
398
|
'dataset2': stats2,
|
|
@@ -402,5 +402,5 @@ def compare_datasets(dataset1_paths: List[str], dataset2_paths: List[str],
|
|
|
402
402
|
'mean_rms_diff': stats2['amplitude_stats']['rms_values']['mean'] - stats1['amplitude_stats']['rms_values']['mean']
|
|
403
403
|
}
|
|
404
404
|
}
|
|
405
|
-
|
|
406
|
-
return comparison
|
|
405
|
+
|
|
406
|
+
return comparison
|
|
@@ -7,13 +7,7 @@ Temporal Features Analysis Module
|
|
|
7
7
|
|
|
8
8
|
提供音频时域特征提取和分析功能
|
|
9
9
|
'''
|
|
10
|
-
|
|
11
|
-
import warnings
|
|
12
|
-
from typing import Tuple, Optional, Union
|
|
13
|
-
from scipy import signal
|
|
14
10
|
import numpy as np
|
|
15
|
-
import librosa
|
|
16
|
-
# from neverlib.utils.utils import dB
|
|
17
11
|
|
|
18
12
|
|
|
19
13
|
def dB(level):
|
|
@@ -42,6 +36,11 @@ def rms_amplitude(wav, frame_length=512, hop_length=256):
|
|
|
42
36
|
:param wav: (*, ch)
|
|
43
37
|
:return: (frame_num,)
|
|
44
38
|
"""
|
|
39
|
+
try:
|
|
40
|
+
import librosa
|
|
41
|
+
except Exception as e:
|
|
42
|
+
raise ImportError("需要安装 librosa 才能使用 rms_amplitude: pip install librosa") from e
|
|
43
|
+
|
|
45
44
|
# 分帧
|
|
46
45
|
frame = librosa.util.frame(wav.flatten(), frame_length=frame_length, hop_length=hop_length) # (frame_length, frame_num)
|
|
47
46
|
rms_amp = np.sqrt(np.mean(frame**2, axis=0)) # (frame_num,)
|
|
@@ -82,6 +81,11 @@ def zero_crossing_rate(self, audio: np.ndarray) -> np.ndarray:
|
|
|
82
81
|
Returns:
|
|
83
82
|
过零率数组
|
|
84
83
|
"""
|
|
84
|
+
try:
|
|
85
|
+
import librosa
|
|
86
|
+
except Exception as e:
|
|
87
|
+
raise ImportError("需要安装 librosa 才能使用 zero_crossing_rate: pip install librosa") from e
|
|
88
|
+
|
|
85
89
|
return librosa.feature.zero_crossing_rate(
|
|
86
90
|
audio, frame_length=self.frame_length, hop_length=self.hop_length
|
|
87
91
|
)[0]
|
|
@@ -97,6 +101,11 @@ def short_time_energy(self, audio: np.ndarray) -> np.ndarray:
|
|
|
97
101
|
Returns:
|
|
98
102
|
短时能量数组
|
|
99
103
|
"""
|
|
104
|
+
try:
|
|
105
|
+
import librosa
|
|
106
|
+
except Exception as e:
|
|
107
|
+
raise ImportError("需要安装 librosa 才能使用 short_time_energy: pip install librosa") from e
|
|
108
|
+
|
|
100
109
|
# 分帧
|
|
101
110
|
frames = librosa.util.frame(
|
|
102
111
|
audio, frame_length=self.frame_length, hop_length=self.hop_length
|