neverlib 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.history/Docs/audio_aug/test_snr_20250806011311.py +0 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011331.py +75 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011342.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011352.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011403.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011413.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011435.py +55 -0
- neverlib/.history/Docs/vad/1_20250810032405.py +0 -0
- neverlib/.history/Docs/vad/1_20250810032417.py +39 -0
- neverlib/.history/audio_aug/audio_aug_20250806010451.py +125 -0
- neverlib/.history/audio_aug/audio_aug_20250806010750.py +138 -0
- neverlib/.history/audio_aug/audio_aug_20250806010759.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010803.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010809.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806011108.py +140 -0
- neverlib/.history/dataAnalyze/__init___20250805234204.py +87 -0
- neverlib/.history/dataAnalyze/__init___20250806204125.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204139.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204159.py +14 -0
- neverlib/.history/filter/__init___20250820103351.py +70 -0
- neverlib/.history/filter/__init___20250821102348.py +70 -0
- neverlib/.history/filter/__init___20250821102405.py +14 -0
- neverlib/.history/filter/auto_eq/__init___20250819213121.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102241.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102259.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102307.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102310.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102318.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102507.py +36 -0
- neverlib/.history/filter/auto_eq/de_eq_20250820103848.py +361 -0
- neverlib/.history/filter/auto_eq/de_eq_20250821102422.py +360 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250805234206.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140732.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140745.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140816.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140938.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141003.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141006.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141019.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141049.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141211.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141227.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141311.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141340.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141712.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141733.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141755.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102434.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102500.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102502.py +76 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820102957.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113054.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113150.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113520.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113525.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250821102212.py +385 -0
- neverlib/.history/metrics/dnsmos_20250806001612.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180659.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180701.py +158 -0
- neverlib/.history/metrics/dnsmos_20250815181321.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181327.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181331.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181620.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181631.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181742.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181824.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181834.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181922.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815182011.py +147 -0
- neverlib/.history/metrics/dnsmos_20250815182036.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815182936.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815182942.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183032.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183101.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183121.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183123.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183214.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183240.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183248.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183407.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183409.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183431.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183507.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183513.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183618.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183709.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183756.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183815.py +128 -0
- neverlib/.history/metrics/dnsmos_20250815183827.py +129 -0
- neverlib/.history/metrics/dnsmos_20250815183913.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815183914.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184003.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184040.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184049.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184104.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184200.py +117 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816015944.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020142.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020156.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020554.py +130 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020600.py +125 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020631.py +120 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020746.py +118 -0
- neverlib/.history/metrics/lpc_me_20250816013111.py +0 -0
- neverlib/.history/metrics/lpc_me_20250816013129.py +121 -0
- neverlib/.history/metrics/lpc_me_20250816015430.py +103 -0
- neverlib/.history/metrics/lpc_me_20250816015535.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015542.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015636.py +97 -0
- neverlib/.history/metrics/lpc_me_20250816015658.py +104 -0
- neverlib/.history/metrics/lpc_me_20250816015703.py +100 -0
- neverlib/.history/metrics/lpc_me_20250816015945.py +128 -0
- neverlib/.history/metrics/snr_20250806010538.py +177 -0
- neverlib/.history/metrics/snr_20250806211634.py +184 -0
- neverlib/.history/metrics/spec_20250805234209.py +45 -0
- neverlib/.history/metrics/spec_20250816135530.py +11 -0
- neverlib/.history/metrics/spec_20250816135654.py +16 -0
- neverlib/.history/metrics/spec_20250816135736.py +68 -0
- neverlib/.history/metrics/spec_20250816135904.py +75 -0
- neverlib/.history/metrics/spec_20250816135921.py +82 -0
- neverlib/.history/metrics/spec_20250816140111.py +82 -0
- neverlib/.history/metrics/spec_20250816140543.py +136 -0
- neverlib/.history/metrics/spec_20250816140559.py +172 -0
- neverlib/.history/metrics/spec_20250816140602.py +172 -0
- neverlib/.history/metrics/spec_20250816140608.py +172 -0
- neverlib/.history/metrics/spec_20250816140654.py +148 -0
- neverlib/.history/metrics/spec_20250816140705.py +144 -0
- neverlib/.history/metrics/spec_20250816140755.py +138 -0
- neverlib/.history/metrics/spec_20250816140823.py +170 -0
- neverlib/.history/metrics/spec_20250816140832.py +170 -0
- neverlib/.history/metrics/spec_20250816140833.py +170 -0
- neverlib/.history/metrics/spec_20250816140922.py +147 -0
- neverlib/.history/metrics/spec_20250816141148.py +107 -0
- neverlib/.history/metrics/spec_20250816141219.py +123 -0
- neverlib/.history/metrics/spec_20250816141732.py +178 -0
- neverlib/.history/metrics/spec_20250816141740.py +178 -0
- neverlib/.history/metrics/spec_20250816142030.py +178 -0
- neverlib/.history/metrics/spec_20250816142107.py +135 -0
- neverlib/.history/metrics/spec_20250816142126.py +135 -0
- neverlib/.history/metrics/spec_20250816142410.py +135 -0
- neverlib/.history/metrics/spec_20250816142415.py +136 -0
- neverlib/.history/metrics/spec_metric_20250816135156.py +0 -0
- neverlib/.history/metrics/spec_metric_20250816135226.py +5 -0
- neverlib/.history/metrics/spec_metric_20250816135227.py +10 -0
- neverlib/.history/metrics/spec_metric_20250816135306.py +15 -0
- neverlib/.history/metrics/spec_metric_20250816135442.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135448.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135520.py +29 -0
- neverlib/.history/metrics/spec_metric_20250816135537.py +63 -0
- neverlib/.history/metrics/spec_metric_20250816135653.py +65 -0
- neverlib/.history/vad/PreProcess_20250805234211.py +63 -0
- neverlib/.history/vad/PreProcess_20250809232455.py +63 -0
- neverlib/.history/vad/PreProcess_20250816020725.py +66 -0
- neverlib/.history/vad/VAD_Silero_20250805234211.py +50 -0
- neverlib/.history/vad/VAD_Silero_20250809232456.py +50 -0
- neverlib/.history/vad/VAD_WebRTC_20250805234211.py +61 -0
- neverlib/.history/vad/VAD_WebRTC_20250809232456.py +61 -0
- neverlib/.history/vad/VAD_funasr_20250805234211.py +54 -0
- neverlib/.history/vad/VAD_funasr_20250809232456.py +54 -0
- neverlib/.history/vad/VAD_vadlib_20250805234211.py +70 -0
- neverlib/.history/vad/VAD_vadlib_20250809232455.py +70 -0
- neverlib/.history/vad/VAD_whisper_20250805234211.py +55 -0
- neverlib/.history/vad/VAD_whisper_20250809232456.py +55 -0
- neverlib/.specstory/.what-is-this.md +69 -0
- neverlib/.specstory/history/2025-08-05_17-06Z-/350/277/231/344/270/200/346/255/245/347/232/204/347/233/256/347/232/204/346/230/257/344/273/200/344/271/210.md +424 -0
- neverlib/Docs/audio_aug/test_snr.py +55 -0
- neverlib/__init__.py +2 -2
- neverlib/audio_aug/HarmonicDistortion.py +79 -0
- neverlib/audio_aug/TFDrop.py +41 -0
- neverlib/audio_aug/TFMask.py +56 -0
- neverlib/audio_aug/__init__.py +1 -1
- neverlib/audio_aug/audio_aug.py +19 -5
- neverlib/audio_aug/clip_aug.py +41 -0
- neverlib/audio_aug/coder_aug.py +209 -0
- neverlib/audio_aug/coder_aug2.py +118 -0
- neverlib/audio_aug/loss_packet_aug.py +103 -0
- neverlib/audio_aug/quant_aug.py +78 -0
- neverlib/data_analyze/README.md +234 -0
- neverlib/data_analyze/__init__.py +14 -0
- neverlib/data_analyze/dataset_analyzer.py +590 -0
- neverlib/data_analyze/quality_metrics.py +364 -0
- neverlib/data_analyze/rms_distrubution.py +62 -0
- neverlib/data_analyze/spectral_analysis.py +218 -0
- neverlib/data_analyze/statistics.py +406 -0
- neverlib/data_analyze/temporal_features.py +126 -0
- neverlib/data_analyze/visualization.py +468 -0
- neverlib/filter/README.md +101 -0
- neverlib/filter/__init__.py +7 -0
- neverlib/filter/auto_eq/README.md +165 -0
- neverlib/filter/auto_eq/__init__.py +36 -0
- neverlib/filter/auto_eq/de_eq.py +360 -0
- neverlib/filter/auto_eq/freq_eq.py +76 -0
- neverlib/filter/auto_eq/ga_eq_advanced.py +577 -0
- neverlib/filter/auto_eq/ga_eq_basic.py +385 -0
- neverlib/filter/biquad.py +45 -0
- neverlib/filter/common.py +5 -6
- neverlib/filter/core.py +339 -0
- neverlib/metrics/dnsmos.py +117 -0
- neverlib/metrics/lpc_lsp.py +118 -0
- neverlib/metrics/snr.py +184 -0
- neverlib/metrics/spec.py +136 -0
- neverlib/metrics/test_pesq.py +35 -0
- neverlib/metrics/time.py +68 -0
- neverlib/tests/test_vad.py +21 -0
- neverlib/utils/audio_split.py +2 -1
- neverlib/utils/message.py +4 -4
- neverlib/utils/utils.py +36 -16
- neverlib/vad/PreProcess.py +6 -3
- neverlib/vad/README.md +10 -10
- neverlib/vad/VAD_Energy.py +1 -1
- neverlib/vad/VAD_Silero.py +2 -2
- neverlib/vad/VAD_WebRTC.py +2 -2
- neverlib/vad/VAD_funasr.py +2 -2
- neverlib/vad/VAD_statistics.py +3 -3
- neverlib/vad/VAD_vadlib.py +3 -3
- neverlib/vad/VAD_whisper.py +2 -2
- neverlib/vad/__init__.py +1 -1
- neverlib/vad/class_get_speech.py +4 -4
- neverlib/vad/class_vad.py +1 -1
- neverlib/vad/utils.py +47 -5
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/METADATA +120 -120
- neverlib-0.2.4.dist-info/RECORD +229 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/WHEEL +1 -1
- neverlib/Documents/vad/VAD_Energy.ipynb +0 -159
- neverlib/Documents/vad/VAD_Silero.ipynb +0 -305
- neverlib/Documents/vad/VAD_WebRTC.ipynb +0 -183
- neverlib/Documents/vad/VAD_funasr.ipynb +0 -179
- neverlib/Documents/vad/VAD_ppasr.ipynb +0 -175
- neverlib/Documents/vad/VAD_statistics.ipynb +0 -522
- neverlib/Documents/vad/VAD_vadlib.ipynb +0 -184
- neverlib/Documents/vad/VAD_whisper.ipynb +0 -430
- neverlib/utils/waveform_analyzer.py +0 -51
- neverlib/wav_data/000_short.wav +0 -0
- neverlib-0.2.2.dist-info/RECORD +0 -40
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/top_level.txt +0 -0
neverlib/metrics/snr.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
sys.path.append("../")
|
|
3
|
+
import librosa
|
|
4
|
+
import numpy as np
|
|
5
|
+
from neverlib.vad.utils import vad2nad
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_snr(speech, noise):
    """Compute the signal-to-noise ratio (SNR) in decibels.

    Both inputs are converted to float64 before squaring so that integer PCM
    data (for example int16) cannot overflow while the power is computed.

    Args:
        speech: Speech signal (array-like).
        noise: Noise signal (array-like) with the same number of dimensions.

    Returns:
        10 * log10(mean(speech**2) / mean(noise**2)) in dB. The noise power is
        floored at 1e-10 so the division never hits zero.
    """
    speech = np.asarray(speech, dtype=np.float64)
    noise = np.asarray(noise, dtype=np.float64)
    assert speech.ndim == noise.ndim, "speech和noise的维度不一样"

    power_speech = np.mean(speech ** 2)
    # Floor the noise power to keep the ratio finite for silent noise.
    power_noise = max(np.mean(noise ** 2), 1e-10)

    return 10 * np.log10(power_speech / power_noise)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_snr_from_noisy(noisy, speech_vad=None):
    """Estimate the SNR (dB) of a noisy recording from its VAD segments.

    The speech regions of ``noisy`` contain speech plus noise, the remaining
    regions contain noise only. The noise power is subtracted from the
    speech-region power to approximate the clean speech power.

    Args:
        noisy: Noisy audio (array-like), sliced along its first axis.
        speech_vad: Speech regions as ``[{'start': int, 'end': int}, ...]``
            sample indices into ``noisy``.

    Returns:
        Estimated SNR in dB.

    Raises:
        AssertionError: If ``speech_vad`` is None.
        ValueError: If there are no speech segments or no non-speech segments
            to estimate the powers from.
    """
    assert speech_vad is not None, "speech_vad不能为空"

    # Machine epsilon used as a power floor. It is defined locally because
    # this module does not import any EPS constant.
    eps = np.finfo(float).eps

    # Work in float64 so integer PCM cannot overflow when squared.
    noisy = np.asarray(noisy, dtype=np.float64)

    # Collect the speech (speech + noise) regions.
    speech_segments = [noisy[seg['start']:seg['end']] for seg in speech_vad]
    if not speech_segments:
        raise ValueError("speech_vad contains no speech segments")
    speech = np.concatenate(speech_segments, axis=0)

    # Collect the non-speech regions; vad2nad is indexed here as a list of
    # {'start', 'end'} dicts covering the complement of speech_vad.
    noise_segments = [
        noisy[seg['start']:seg['end']]
        for seg in vad2nad(speech_vad, len(noisy))
    ]
    if not noise_segments:
        raise ValueError("no non-speech segments available to estimate the noise power")
    noise = np.concatenate(noise_segments, axis=0)

    p_speech_noise = np.mean(speech ** 2)      # power of speech + noise
    p_noise = max(np.mean(noise ** 2), eps)    # power of noise only

    # Net speech power, floored so the logarithm stays finite.
    p_speech = max(p_speech_noise - p_noise, eps)

    return 10 * np.log10(p_speech / p_noise)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def seg_snr(clean, noisy, frame_length: int, hop_length: int):
    """Compute the segmental (frame-averaged) SNR between clean and noisy audio.

    For every frame the noise is taken as the residual ``noisy - clean`` and
    the frame SNR is ``10 * log10(mean(clean**2) / mean(noise**2))``, with the
    noise power floored at 1e-10. Frames in which the clean or the noisy
    signal is entirely below 1e-6 in magnitude are skipped as silence.

    Args:
        clean: Clean audio, 1-D array-like.
        noisy: Noisy audio, 1-D array-like with the same shape as ``clean``.
        frame_length: Frame length in samples.
        hop_length: Hop between consecutive frames in samples.

    Returns:
        Mean of the per-frame SNR values in dB, or ``float('-inf')`` when
        every frame is silent.

    Raises:
        AssertionError: If ``clean`` and ``noisy`` have different shapes.
        ValueError: If the signals are not 1-D, if ``frame_length`` or
            ``hop_length`` is not positive, or if the signal is shorter than
            one frame.
    """
    # float64 avoids integer overflow when PCM samples are squared.
    clean = np.asarray(clean, dtype=np.float64)
    noisy = np.asarray(noisy, dtype=np.float64)
    assert clean.shape == noisy.shape, "clean和noisy的维度不一样"

    if clean.ndim != 1:
        raise ValueError("seg_snr expects 1-D signals")
    if frame_length < 1 or hop_length < 1:
        raise ValueError("frame_length and hop_length must be positive")
    if clean.shape[0] < frame_length:
        raise ValueError("signal is shorter than frame_length")

    # The distortion is the difference between the noisy and clean signals,
    # not the noisy signal itself.
    noise = noisy - clean

    snr_frames = []
    # Only full frames are used: 1 + (n - frame_length) // hop_length frames.
    for start in range(0, clean.shape[0] - frame_length + 1, hop_length):
        stop = start + frame_length
        clean_frame = clean[start:stop]
        noisy_frame = noisy[start:stop]

        # Skip silent frames.
        if np.all(np.abs(clean_frame) < 1e-6) or np.all(np.abs(noisy_frame) < 1e-6):
            continue

        power_clean = np.mean(clean_frame ** 2)
        power_noise = max(np.mean(noise[start:stop] ** 2), 1e-10)
        snr_frames.append(10 * np.log10(power_clean / power_noise))

    # Every frame was silent.
    if not snr_frames:
        return float('-inf')

    return np.mean(snr_frames)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def psnr(clean, noisy, max_val=None):
    """Compute the peak signal-to-noise ratio (PSNR) in decibels.

    Inputs are converted to float64 first, so integer PCM data cannot
    overflow when the peak value or the error is squared.

    Args:
        clean: Clean audio, array-like.
        noisy: Noisy audio, array-like with the same shape as ``clean``.
        max_val: Peak signal value. When None, the largest absolute sample
            of ``clean`` is used.

    Returns:
        ``10 * log10(max_val**2 / MSE)`` in dB, or ``float('inf')`` when the
        two signals are identical (MSE of zero).
    """
    clean = np.asarray(clean, dtype=np.float64)
    noisy = np.asarray(noisy, dtype=np.float64)
    assert clean.shape == noisy.shape, "clean和noisy的维度不一样"

    # Fall back to the actual peak of the clean signal.
    if max_val is None:
        max_val = np.abs(clean).max()
    # A caller-supplied integer scalar must not overflow when squared either.
    max_val = np.asarray(max_val, dtype=np.float64)

    # Mean squared error between the two signals.
    mse = np.mean((clean - noisy) ** 2)

    # Identical signals: avoid dividing by zero.
    if mse == 0:
        return float('inf')

    return 10 * np.log10(max_val ** 2 / mse)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def si_sdr(reference, estimate, epsilon=1e-8):
    """Compute the scale-invariant signal-to-distortion ratio (SI-SDR) in dB.

    Both signals are made zero-mean, the estimate is projected onto the
    reference to obtain the target component, and the ratio between the
    target energy and the residual energy is returned.

    Args:
        reference (np.ndarray): Clean reference signal (1-D array).
        estimate (np.ndarray): Estimated / processed signal (1-D array).
        epsilon (float): Small constant used both as an energy threshold and
            as a stabiliser added to the denominators.

    Returns:
        float: SI-SDR in dB. ``-np.inf`` when the zero-mean reference energy
        is below ``epsilon``; ``np.inf`` when the residual energy is below
        ``epsilon``.
    """
    assert reference.shape == estimate.shape, "reference和estimate的维度不一样"

    # Remove the DC component from both signals.
    ref_zm = reference - np.mean(reference)
    est_zm = estimate - np.mean(estimate)

    # ||s||^2: a (near-)silent reference makes SI-SDR meaningless.
    ref_energy = np.dot(ref_zm, ref_zm)
    if ref_energy < epsilon:
        return -np.inf

    # Optimal scale alpha = <s_hat, s> / ||s||^2, then s_target = alpha * s.
    scale = np.dot(est_zm, ref_zm) / (ref_energy + epsilon)
    target = scale * ref_zm

    # Whatever the projection does not explain is distortion.
    residual = est_zm - target
    residual_energy = np.sum(residual ** 2)
    if residual_energy < epsilon:
        # Essentially a perfect match up to scale.
        return np.inf

    target_energy = np.sum(target ** 2)
    return 10 * np.log10(target_energy / (residual_energy + epsilon))
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
if __name__ == "__main__":
    # Synthetic demo data: Gaussian "speech" plus Gaussian noise at a tenth
    # of its scale.
    speech = np.random.randn(1000)
    noise = np.random.randn(1000) * 0.1
    noisy = speech + noise

    # Print the result of each SNR-style metric on the demo data.
    print(f"SNR: {get_snr(speech, noise):.2f} dB")
    print(f"Segmental SNR: {seg_snr(speech, noisy, 100, 50):.2f} dB")
    print(f"PSNR: {psnr(speech, noisy):.2f} dB")
|
neverlib/metrics/spec.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-16 13:51:57
|
|
4
|
+
Description: 音频信号频域客观度量指标计算工具
|
|
5
|
+
主要功能:
|
|
6
|
+
1. SD (Spectral Distance) - 频谱距离
|
|
7
|
+
- 计算两个音频信号在频域上的差异程度
|
|
8
|
+
- 适用于音频质量评估和信号相似性分析
|
|
9
|
+
|
|
10
|
+
2. LSD (Log-Spectral Distance) - 对数谱距离
|
|
11
|
+
- 在对数功率谱域计算信号距离
|
|
12
|
+
- 更符合人耳听觉特性,常用于语音质量评估
|
|
13
|
+
|
|
14
|
+
3. MCD (Mel-Cepstral Distance) - 梅尔倒谱距离
|
|
15
|
+
- 基于MFCC特征的音频相似性度量
|
|
16
|
+
- 广泛应用于语音合成、语音识别等任务
|
|
17
|
+
'''
|
|
18
|
+
|
|
19
|
+
import librosa
|
|
20
|
+
import numpy as np
|
|
21
|
+
import soundfile as sf
|
|
22
|
+
from neverlib.utils import EPS
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def sd(ref_wav, test_wav, n_fft=2048, hop_length=512, win_length=None):
    """Compute the spectral distance (SD) between two audio signals.

    The distance is the root of the mean squared magnitude of the difference
    between the two complex STFTs.

    Args:
        ref_wav (np.ndarray): Reference audio signal (1-D array).
        test_wav (np.ndarray): Test audio signal (1-D array).
        n_fft (int): FFT size, 2048 by default.
        hop_length (int): Hop length between frames, 512 by default.
        win_length (int, optional): Window length passed through to
            ``librosa.stft``.

    Returns:
        float: Spectral distance; smaller means the signals are more similar.
    """
    assert len(ref_wav) == len(test_wav), "输入信号长度必须相同"

    # Same STFT configuration for both signals.
    stft_kwargs = dict(n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    ref_spec = librosa.stft(ref_wav, **stft_kwargs)
    test_spec = librosa.stft(test_wav, **stft_kwargs)

    # Root mean square of the complex spectral difference.
    return np.sqrt(np.mean(np.abs(ref_spec - test_spec) ** 2))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def lsd(ref_wav, test_wav, n_fft=2048, hop_length=512, win_length=None):
    """Compute the log-spectral distance (LSD) between two 1-D audio signals.

    The power spectrograms of both signals are converted to dB and the root
    mean squared error over all time-frequency bins is returned. The
    module-level ``EPS`` constant is added to the power before taking the
    logarithm so that zero-power bins do not produce ``-inf``.

    Args:
        ref_wav (np.ndarray): Clean reference signal (1-D array).
        test_wav (np.ndarray): Estimated / processed signal (1-D array).
        n_fft (int): FFT size.
        hop_length (int): Hop length between frames.
        win_length (int, optional): Window length; defaults to ``n_fft``
            when None.

    Returns:
        float: Log-spectral distance in dB.
    """
    assert ref_wav.ndim == 1 and test_wav.ndim == 1, "输入信号必须是一维数组。"

    if win_length is None:
        win_length = n_fft

    ref_stft = librosa.stft(ref_wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)  # (F, T)
    test_stft = librosa.stft(test_wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)  # (F, T)

    # Power spectrograms, then log-power in dB.
    ref_log_power_spec = 10 * np.log10(np.abs(ref_stft) ** 2 + EPS)
    test_log_power_spec = 10 * np.log10(np.abs(test_stft) ** 2 + EPS)

    # RMSE over every time-frequency bin.
    squared_error = (ref_log_power_spec - test_log_power_spec) ** 2
    return np.sqrt(np.mean(squared_error))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def mcd(ref_wav, test_wav, sr=16000, n_mfcc=13):
    """Compute a mel-cepstral distance (MCD) between two audio signals.

    MFCCs are extracted from both signals with ``librosa.feature.mfcc`` and
    compared with the 0th coefficient left out. The squared differences are
    summed per frame, averaged over frames, and the square root is taken
    after that average.

    Args:
        ref_wav (np.ndarray): Reference audio signal (1-D array).
        test_wav (np.ndarray): Test audio signal (1-D array).
        sr (int): Sampling rate, 16000 Hz by default.
        n_mfcc (int): Number of MFCC coefficients, 13 by default.

    Returns:
        float: Mel-cepstral distance; smaller means the signals are more
        similar.
    """
    assert len(ref_wav) == len(test_wav), "输入信号长度必须相同"

    # MFCC matrices, one row per coefficient.
    ref_mfcc = librosa.feature.mfcc(y=ref_wav, sr=sr, n_mfcc=n_mfcc)
    test_mfcc = librosa.feature.mfcc(y=test_wav, sr=sr, n_mfcc=n_mfcc)

    # Skip coefficient 0, which mainly carries energy.
    delta = ref_mfcc[1:] - test_mfcc[1:]
    per_frame_sq_dist = np.sum(delta ** 2, axis=0)

    scale = 10.0 / np.log(10)
    return scale * np.sqrt(2 * np.mean(per_frame_sq_dist))
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
if __name__ == "__main__":
    # The same file is used as reference and test signal in this demo.
    ref_file = "../data/vad_example.wav"   # reference audio path
    test_file = "../data/vad_example.wav"  # test audio path

    ref_wav, ref_sr = sf.read(ref_file)
    test_wav, test_sr = sf.read(test_file)
    assert ref_sr == test_sr == 16000, "采样率必须为16000Hz"
    assert len(ref_wav) == len(test_wav), "音频长度必须相同"

    # Mel-cepstral distance
    mcd_value = mcd(ref_wav, test_wav)
    print(f"梅尔倒谱距离: {mcd_value:.2f}")

    # Log-spectral distance
    lsd_value = lsd(ref_wav, test_wav)
    print(f"对数谱距离: {lsd_value:.2f}")

    # Spectral distance
    sd_value = sd(ref_wav, test_wav)
    print(f"频谱距离: {sd_value:.2f}")
|
|
136
|
+
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-05 23:37:31
|
|
4
|
+
Description:
|
|
5
|
+
|
|
6
|
+
PESQ 包含 3 种类型的值: NB PESQ MOS、NB MOS LQO、WB MOS LQO。此包仅返回 NB PESQ MOS, 即窄带手持设备收听 (narrowband handset listening) 场景下的 Raw MOS 分数。
|
|
7
|
+
'''
|
|
8
|
+
import pesq
|
|
9
|
+
import pypesq
|
|
10
|
+
import librosa
|
|
11
|
+
import os
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
# Sampling rate used both for loading the audio and for the PESQ calls.
fs = 16000

# Load the clean reference and the enhanced signal, resampled to fs.
clean = librosa.load("../data/000_short.wav", sr=fs)[0]
enhance = librosa.load("../data/000_short_enhance.wav", sr=fs)[0]

# Scores from the two Python PESQ packages; trailing values are the results
# recorded by the author.
print(pesq.pesq(fs, clean, enhance, 'wb'))  # 3.5920536518096924
print(pypesq.pesq(clean, enhance, fs=fs))  # 3.817176103591919

# Results recorded from the C PESQ binary, kept for comparison:
# os.system("./pesq_c/PESQ +16000 ../data/000_short.wav ../data/000_short_enhance.wav")  # WB PESQ_MOS = 3.518
# os.system("./pesq_c/PESQ +8000 ../data/000_short.wav ../data/000_short_enhance.wav")  # NB PESQ_MOS = 3.477
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def pesq2mos(pesq):
    """Map a raw PESQ score to a MOS-LQO score with a logistic function.

    The original note gives the ranges as PESQ in [-0.5, 4.5] and MOS-LQO in
    [1, 4.5], and attributes the mapping to ITU-T P.862.1.
    """
    mos_floor, mos_ceiling = 0.999, 4.999
    logistic_denominator = 1 + np.exp(-1.4945 * pesq + 4.6607)
    return mos_floor + (mos_ceiling - mos_floor) / logistic_denominator
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def mos2pesq(mos):
    """Map a MOS-LQO score back to a raw PESQ score.

    This inverts the logistic mapping attributed to ITU-T P.862.1; the
    original note gives the ranges as MOS-LQO in [1, 4.5] and PESQ in
    [-0.5, 4.5].
    """
    # Ratio of the distance to the upper asymptote over the distance to the
    # lower asymptote.
    ratio = (4.999 - mos) / (mos - 0.999)
    return (4.6607 - np.log(ratio)) / 1.4945
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# print(mos2pesq(3.518))
|
neverlib/metrics/time.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-05 16:44:41
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
"""
|
|
7
|
+
音频数据分析基础工具模块
|
|
8
|
+
Basic Utilities for Audio Data Analysis
|
|
9
|
+
|
|
10
|
+
提供音频分析的基础工具函数
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import librosa
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def peak_amplitude(wav):
    """Return the peak absolute amplitude of an audio signal.

    The value is on the linear sample scale; no dB conversion is applied.
    Signed integer input is widened to int64 first, because the absolute
    value of the most negative sample (e.g. -32768 for int16) does not fit
    in the original integer type.

    Args:
        wav: Audio signal, array-like; described as (*, ch) by the module.

    Returns:
        The largest absolute sample value over the whole array.
    """
    wav = np.asarray(wav)
    if np.issubdtype(wav.dtype, np.signedinteger):
        wav = wav.astype(np.int64)
    return np.max(np.abs(wav))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def rms_amplitude(wav):
    """Return the root-mean-square (RMS) amplitude of an audio signal.

    The input is converted to float64 before squaring so that integer PCM
    samples cannot overflow.

    Args:
        wav: Audio signal, array-like; described as (*, ch) by the module.

    Returns:
        RMS amplitude computed over the whole array.
    """
    samples = np.asarray(wav, dtype=np.float64)
    return np.sqrt(np.mean(np.square(samples)))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def mean_rms_amplitude(wav, frame_length=512, hop_length=256):
    """Return the mean of the per-frame RMS amplitudes of an audio signal.

    The signal is flattened into one sequence, split into frames, and the
    RMS of every frame is averaged. The input is converted to float64 first
    so that integer PCM samples cannot overflow when squared.

    Args:
        wav: Audio signal, array-like; described as (*, ch) by the module.
            All dimensions are flattened before framing.
        frame_length: Frame length in samples.
        hop_length: Hop between consecutive frames in samples.

    Returns:
        Mean RMS amplitude over all frames.
    """
    samples = np.asarray(wav, dtype=np.float64).flatten()

    # Frames are stacked as (frame_length, n_frames).
    frame = librosa.util.frame(samples, frame_length=frame_length, hop_length=hop_length)
    rms_amp = np.sqrt(np.mean(np.square(frame), axis=0))
    return np.mean(rms_amp)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def dc_offset(wav):
    """Return the DC offset of an audio signal.

    Args:
        wav: Audio signal; described as (*, ch) by the module.

    Returns:
        Arithmetic mean over every sample of ``wav``.
    """
    offset = np.mean(wav)
    return offset
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-05 17:29:43
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
sys.path.append("../")
|
|
9
|
+
from vad.utils import vad2nad
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_vad2nad():
    """Smoke-test vad2nad by printing its output for two speech segments."""
    total_length = 4000
    vad = [{'start': 100, 'end': 1000}, {'start': 2000, 'end': 3000}]
    # The result is only printed; nothing is asserted about it.
    print(vad2nad(vad, total_length))


if __name__ == "__main__":
    test_vad2nad()
|
neverlib/utils/audio_split.py
CHANGED
|
@@ -14,7 +14,7 @@ from pydub import AudioSegment
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def audio_split_ffmpeg(source_path, target_path, sr, channel_num, duration, endwith="*.pcm"):
|
|
17
|
-
"""
|
|
17
|
+
""" 切割音频切不准, 会留点尾巴0.016s
|
|
18
18
|
使用ffmpeg分割音频, 分割为短音频(单位:秒), 似乎无法非常准确的分割到指定长度
|
|
19
19
|
:param source_path: 源音频路径
|
|
20
20
|
:param target_path: 目标音频路径
|
|
@@ -94,6 +94,7 @@ def audio_split_np(source_path, target_path, sr, channel_num, duration, endwith=
|
|
|
94
94
|
|
|
95
95
|
# 注意读取时使用正确的dtype(例如int16表示16位PCM)
|
|
96
96
|
pcm_data = np.fromfile(wav_path, dtype=np.int16)
|
|
97
|
+
pcm_data = pcm_data[:(len(pcm_data) // channel_num) * channel_num]
|
|
97
98
|
pcm_data = pcm_data.reshape(-1, channel_num)
|
|
98
99
|
|
|
99
100
|
# 计算分割的数量
|
neverlib/utils/message.py
CHANGED
|
@@ -57,7 +57,7 @@ def send_QQEmail_with_images(title, content, from_name, from_email, from_passwor
|
|
|
57
57
|
:param from_email: 发件人邮箱
|
|
58
58
|
:param from_password: 发件人邮箱SMTP授权码
|
|
59
59
|
:param to_email: 收件人邮箱
|
|
60
|
-
:param image_paths:
|
|
60
|
+
:param image_paths: 图片文件路径列表, 应为PNG格式
|
|
61
61
|
"""
|
|
62
62
|
# 设置邮箱的域名
|
|
63
63
|
HOST = "smtp.qq.com"
|
|
@@ -87,10 +87,10 @@ def send_QQEmail_with_images(title, content, from_name, from_email, from_passwor
|
|
|
87
87
|
# 使用 MIMEImage 添加图片
|
|
88
88
|
image_part = MIMEImage(img_data)
|
|
89
89
|
|
|
90
|
-
# 设置Content-ID
|
|
90
|
+
# 设置Content-ID, 以便在正文中引用图片
|
|
91
91
|
image_part.add_header('Content-ID', cid)
|
|
92
92
|
|
|
93
|
-
# 设置为 inline
|
|
93
|
+
# 设置为 inline 显示, 避免附件处理
|
|
94
94
|
image_part.add_header('Content-Disposition', 'inline', filename=os.path.basename(image_path))
|
|
95
95
|
|
|
96
96
|
# 添加图片到邮件
|
|
@@ -115,7 +115,7 @@ def send_QQEmail_with_images(title, content, from_name, from_email, from_passwor
|
|
|
115
115
|
|
|
116
116
|
|
|
117
117
|
if __name__ == "__main__":
|
|
118
|
-
send_QQEmail("实验跑完", "
|
|
118
|
+
send_QQEmail("实验跑完", "实验跑完了, 快去看看吧!",
|
|
119
119
|
from_email="1786088386@qq.com", from_password="xxxx",
|
|
120
120
|
to_email="1786088386@qq.com")
|
|
121
121
|
pass
|
neverlib/utils/utils.py
CHANGED
|
@@ -12,6 +12,7 @@ from tqdm import tqdm
|
|
|
12
12
|
from datetime import datetime
|
|
13
13
|
import soundfile as sf
|
|
14
14
|
import numpy as np
|
|
15
|
+
EPS = np.finfo(float).eps
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def get_path_list(source_path, end="*.wav", shuffle=False):
|
|
@@ -20,7 +21,8 @@ def get_path_list(source_path, end="*.wav", shuffle=False):
|
|
|
20
21
|
# 实现列表特殊字符的过滤或筛选,返回符合匹配“.wav”字符列表
|
|
21
22
|
for filename in fnmatch.filter(filenames, end):
|
|
22
23
|
wav_list.append(os.path.join(root, filename))
|
|
23
|
-
|
|
24
|
+
if os.environ.get("LOCAL_RANK", "0") == "0":
|
|
25
|
+
print(source_path, len(wav_list))
|
|
24
26
|
if shuffle:
|
|
25
27
|
random.shuffle(wav_list)
|
|
26
28
|
return wav_list
|
|
@@ -49,10 +51,10 @@ def rename_files_and_folders(directory, replace='_-', replacement='_'):
|
|
|
49
51
|
def get_file_time(file_path):
    """Return the last-modification time of ``file_path`` as a ``datetime``.

    The modification timestamp reported by the filesystem is converted to a
    naive ``datetime`` in local time (year, month, day, hour, minute, second).

    Args:
        file_path: Path of the file to inspect.

    Returns:
        datetime: Last-modification time of the file.
    """
    return datetime.fromtimestamp(os.path.getmtime(file_path))
|
|
@@ -97,38 +99,38 @@ def TrainValTestSplit(dataset_dir, train_dir, val_dir, test_dir, percentage=[0.8
|
|
|
97
99
|
:param percentage: 分割百分比
|
|
98
100
|
"""
|
|
99
101
|
assert sum(percentage) == 1.0, "百分比总和必须等于1.0"
|
|
100
|
-
|
|
102
|
+
|
|
101
103
|
wav_path_list = sorted(get_path_list(dataset_dir, end="*.wav"))
|
|
102
104
|
random.seed(10086)
|
|
103
105
|
random.shuffle(wav_path_list) # 打乱列表的顺序
|
|
104
106
|
total_wav_num = len(wav_path_list)
|
|
105
|
-
|
|
107
|
+
|
|
106
108
|
# 计算训练集、验证集和测试集的分割点
|
|
107
109
|
train_split_idx = int(total_wav_num * percentage[0])
|
|
108
110
|
val_split_idx = train_split_idx + int(total_wav_num * percentage[1])
|
|
109
|
-
|
|
111
|
+
|
|
110
112
|
train_path_list = wav_path_list[:train_split_idx]
|
|
111
113
|
val_path_list = wav_path_list[train_split_idx:val_split_idx]
|
|
112
114
|
test_path_list = wav_path_list[val_split_idx:]
|
|
113
|
-
|
|
115
|
+
|
|
114
116
|
for train_wavpath in tqdm(train_path_list, desc="复制训练集音频"):
|
|
115
117
|
target_path = train_wavpath.replace(dataset_dir, train_dir)
|
|
116
118
|
if not os.path.exists(os.path.split(target_path)[0]):
|
|
117
119
|
os.makedirs(os.path.split(target_path)[0])
|
|
118
120
|
shutil.copy(train_wavpath, target_path)
|
|
119
|
-
|
|
121
|
+
|
|
120
122
|
for val_wavpath in tqdm(val_path_list, desc="复制验证集音频"):
|
|
121
123
|
target_path = val_wavpath.replace(dataset_dir, val_dir)
|
|
122
124
|
if not os.path.exists(os.path.split(target_path)[0]):
|
|
123
125
|
os.makedirs(os.path.split(target_path)[0])
|
|
124
126
|
shutil.copy(val_wavpath, target_path)
|
|
125
|
-
|
|
127
|
+
|
|
126
128
|
for test_wavpath in tqdm(test_path_list, desc="复制测试集音频"):
|
|
127
129
|
target_path = test_wavpath.replace(dataset_dir, test_dir)
|
|
128
130
|
if not os.path.exists(os.path.split(target_path)[0]):
|
|
129
131
|
os.makedirs(os.path.split(target_path)[0])
|
|
130
132
|
shutil.copy(test_wavpath, target_path)
|
|
131
|
-
|
|
133
|
+
|
|
132
134
|
print(f"完成! 训练集: {len(train_path_list)}个文件, 验证集: {len(val_path_list)}个文件, 测试集: {len(test_path_list)}个文件")
|
|
133
135
|
|
|
134
136
|
|
|
@@ -141,15 +143,33 @@ def get_leaf_folders(directory):
|
|
|
141
143
|
return leaf_folders
|
|
142
144
|
|
|
143
145
|
|
|
146
|
+
def del_empty_folders(path):
    """Recursively delete empty folders below (and including) ``path``.

    Children are processed before their parent, so a folder that only
    contained empty folders is itself removed once they are gone.

    Symbolic links are never followed into and never removed: following a
    link could delete empty folders outside of ``path``, and ``os.rmdir``
    fails when pointed at a link instead of a real directory.

    Args:
        path: Root folder to clean. Anything that is not a directory is
            silently ignored.

    Returns:
        None. Each removed folder is reported on stdout.
    """
    if not os.path.isdir(path):
        return

    # Collect real sub-directories only; links to directories are skipped.
    with os.scandir(path) as entries:
        subfolders = [entry.path for entry in entries
                      if entry.is_dir(follow_symlinks=False)]

    # Depth-first: clean the children before looking at this folder.
    for subfolder in subfolders:
        del_empty_folders(subfolder)

    # A link passed in explicitly has had its target cleaned above, but the
    # link itself is left alone because os.rmdir cannot remove it.
    if os.path.islink(path):
        return

    # Remove the folder once nothing is left inside it.
    if not os.listdir(path):
        os.rmdir(path)
        print(f"删除空文件夹: {path}")
|
|
162
|
+
|
|
163
|
+
|
|
144
164
|
def DatasetSubfloderSplit(source_dir, split_dirs, percentage=None):
|
|
145
165
|
"""
|
|
146
166
|
将一个数据集按照子文件夹数量分割成train/val/test数据集
|
|
147
167
|
Args:
|
|
148
168
|
source_dir (str): 源数据集目录
|
|
149
|
-
split_dirs (list):
|
|
150
|
-
percentage (list, optional):
|
|
151
|
-
-
|
|
152
|
-
-
|
|
169
|
+
split_dirs (list): 目标目录列表, 如 [train_dir, val_dir] 或 [train_dir, val_dir, test_dir]
|
|
170
|
+
percentage (list, optional): 分割比例, 如 [0.9, 0.1] 或 [0.8, 0.1, 0.1]。默认为 None, 此时:
|
|
171
|
+
- 如果是两路分割, 默认为 [0.9, 0.1]
|
|
172
|
+
- 如果是三路分割, 默认为 [0.8, 0.1, 0.1]
|
|
153
173
|
Example:
|
|
154
174
|
# 两路分割示例
|
|
155
175
|
DatasetSplit(
|
|
@@ -247,11 +267,11 @@ def save_weight_histogram(model, save_dir, mode=["params", "buffers"], ignore_na
|
|
|
247
267
|
Args:
|
|
248
268
|
model: PyTorch模型
|
|
249
269
|
save_dir: 保存路径
|
|
250
|
-
mode:
|
|
270
|
+
mode: 保存模式, 可选值为["params", "buffers"]
|
|
251
271
|
bins: 直方图bin数量
|
|
252
272
|
"""
|
|
253
273
|
import matplotlib.pyplot as plt
|
|
254
|
-
#
|
|
274
|
+
# 如果路径存在, 则删除
|
|
255
275
|
if os.path.exists(save_dir):
|
|
256
276
|
shutil.rmtree(save_dir)
|
|
257
277
|
|
neverlib/vad/PreProcess.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'''
|
|
2
2
|
Author: 凌逆战 | Never
|
|
3
3
|
Date: 2025-02-13 20:06:07
|
|
4
|
-
LastEditTime: 2025-
|
|
4
|
+
LastEditTime: 2025-08-16 02:07:24
|
|
5
5
|
FilePath: \neverlib\vad\PreProcess.py
|
|
6
6
|
Description:
|
|
7
7
|
'''
|
|
@@ -9,13 +9,16 @@ Description:
|
|
|
9
9
|
# Author:凌逆战 | Never
|
|
10
10
|
# Date: 2024/9/14
|
|
11
11
|
"""
|
|
12
|
-
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
13
|
"""
|
|
14
14
|
import numpy as np
|
|
15
15
|
import noisereduce as nr
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def pre_emphasis(audio_data, alpha=0.97):
    """Apply a first-order pre-emphasis filter to a signal.

    Implements ``y(n) = x(n) - alpha * x(n - 1)`` with ``y(0) = x(0)``.

    Args:
        audio_data: Input samples; any array-like is accepted and converted
            with ``np.asarray``.
        alpha: Pre-emphasis coefficient.

    Returns:
        np.ndarray: Filtered signal with the same number of samples as the
        input. An empty input yields an empty array.

    NOTE(review): written for 1-D signals. ``np.append`` flattens its
    arguments, so multi-dimensional input is not filtered per channel.
    """
    audio_data = np.asarray(audio_data)

    # Nothing to filter; indexing element 0 below would raise IndexError.
    if audio_data.size == 0:
        return audio_data.copy()

    # y(n) = x(n) - alpha * x(n - 1); the first sample is passed through.
    emphasized_audio = np.append(audio_data[0], audio_data[1:] - alpha * audio_data[:-1])
    return emphasized_audio
|
|
@@ -45,7 +48,7 @@ def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
|
|
|
45
48
|
def NS_test():
|
|
46
49
|
import soundfile as sf
|
|
47
50
|
sr = 16000
|
|
48
|
-
wav_path = "../../
|
|
51
|
+
wav_path = "../../data/vad_example.wav"
|
|
49
52
|
wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
|
|
50
53
|
wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
|
|
51
54
|
sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)
|