neverlib 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.history/Docs/audio_aug/test_snr_20250806011311.py +0 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011331.py +75 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011342.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011352.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011403.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011413.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011435.py +55 -0
- neverlib/.history/Docs/vad/1_20250810032405.py +0 -0
- neverlib/.history/Docs/vad/1_20250810032417.py +39 -0
- neverlib/.history/audio_aug/audio_aug_20250806010451.py +125 -0
- neverlib/.history/audio_aug/audio_aug_20250806010750.py +138 -0
- neverlib/.history/audio_aug/audio_aug_20250806010759.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010803.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010809.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806011108.py +140 -0
- neverlib/.history/dataAnalyze/__init___20250806204125.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204139.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204159.py +14 -0
- neverlib/.history/filter/__init___20250820103351.py +70 -0
- neverlib/.history/filter/__init___20250821102348.py +70 -0
- neverlib/.history/filter/__init___20250821102405.py +14 -0
- neverlib/.history/filter/auto_eq/__init___20250819213121.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102241.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102259.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102307.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102310.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102318.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102507.py +36 -0
- neverlib/{filter/AudoEQ/auto_eq_de.py → .history/filter/auto_eq/de_eq_20250820103848.py} +1 -1
- neverlib/.history/filter/auto_eq/de_eq_20250821102422.py +360 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140732.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140745.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140816.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140938.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141003.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141006.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141019.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141049.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141211.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141227.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141311.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141340.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141712.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141733.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141755.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102434.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102500.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102502.py +76 -0
- neverlib/{filter/AudoEQ/auto_eq_ga_basic.py → .history/filter/auto_eq/ga_eq_basic_20250820102957.py} +1 -1
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113054.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113150.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113520.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113525.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250821102212.py +385 -0
- neverlib/.history/metrics/dnsmos_20250806001612.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180659.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180701.py +158 -0
- neverlib/.history/metrics/dnsmos_20250815181321.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181327.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181331.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181620.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181631.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181742.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181824.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181834.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181922.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815182011.py +147 -0
- neverlib/.history/metrics/dnsmos_20250815182036.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815182936.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815182942.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183032.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183101.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183121.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183123.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183214.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183240.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183248.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183407.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183409.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183431.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183507.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183513.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183618.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183709.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183756.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183815.py +128 -0
- neverlib/.history/metrics/dnsmos_20250815183827.py +129 -0
- neverlib/.history/metrics/dnsmos_20250815183913.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815183914.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184003.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184040.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184049.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184104.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184200.py +117 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816015944.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020142.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020156.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020554.py +130 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020600.py +125 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020631.py +120 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020746.py +118 -0
- neverlib/.history/metrics/lpc_me_20250816013111.py +0 -0
- neverlib/.history/metrics/lpc_me_20250816013129.py +121 -0
- neverlib/.history/metrics/lpc_me_20250816015430.py +103 -0
- neverlib/.history/metrics/lpc_me_20250816015535.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015542.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015636.py +97 -0
- neverlib/.history/metrics/lpc_me_20250816015658.py +104 -0
- neverlib/.history/metrics/lpc_me_20250816015703.py +100 -0
- neverlib/.history/metrics/lpc_me_20250816015945.py +128 -0
- neverlib/.history/metrics/snr_20250806010538.py +177 -0
- neverlib/.history/metrics/snr_20250806211634.py +184 -0
- neverlib/.history/metrics/spec_20250805234209.py +45 -0
- neverlib/.history/metrics/spec_20250816135530.py +11 -0
- neverlib/.history/metrics/spec_20250816135654.py +16 -0
- neverlib/.history/metrics/spec_20250816135736.py +68 -0
- neverlib/.history/metrics/spec_20250816135904.py +75 -0
- neverlib/.history/metrics/spec_20250816135921.py +82 -0
- neverlib/.history/metrics/spec_20250816140111.py +82 -0
- neverlib/.history/metrics/spec_20250816140543.py +136 -0
- neverlib/.history/metrics/spec_20250816140559.py +172 -0
- neverlib/.history/metrics/spec_20250816140602.py +172 -0
- neverlib/.history/metrics/spec_20250816140608.py +172 -0
- neverlib/.history/metrics/spec_20250816140654.py +148 -0
- neverlib/.history/metrics/spec_20250816140705.py +144 -0
- neverlib/.history/metrics/spec_20250816140755.py +138 -0
- neverlib/.history/metrics/spec_20250816140823.py +170 -0
- neverlib/.history/metrics/spec_20250816140832.py +170 -0
- neverlib/.history/metrics/spec_20250816140833.py +170 -0
- neverlib/.history/metrics/spec_20250816140922.py +147 -0
- neverlib/.history/metrics/spec_20250816141148.py +107 -0
- neverlib/.history/metrics/spec_20250816141219.py +123 -0
- neverlib/.history/metrics/spec_20250816141732.py +178 -0
- neverlib/.history/metrics/spec_20250816141740.py +178 -0
- neverlib/.history/metrics/spec_20250816142030.py +178 -0
- neverlib/.history/metrics/spec_20250816142107.py +135 -0
- neverlib/.history/metrics/spec_20250816142126.py +135 -0
- neverlib/.history/metrics/spec_20250816142410.py +135 -0
- neverlib/.history/metrics/spec_20250816142415.py +136 -0
- neverlib/.history/metrics/spec_metric_20250816135156.py +0 -0
- neverlib/.history/metrics/spec_metric_20250816135226.py +5 -0
- neverlib/.history/metrics/spec_metric_20250816135227.py +10 -0
- neverlib/.history/metrics/spec_metric_20250816135306.py +15 -0
- neverlib/.history/metrics/spec_metric_20250816135442.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135448.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135520.py +29 -0
- neverlib/.history/metrics/spec_metric_20250816135537.py +63 -0
- neverlib/.history/metrics/spec_metric_20250816135653.py +65 -0
- neverlib/.history/vad/PreProcess_20250805234211.py +63 -0
- neverlib/.history/vad/PreProcess_20250809232455.py +63 -0
- neverlib/.history/vad/PreProcess_20250816020725.py +66 -0
- neverlib/.history/vad/VAD_Silero_20250805234211.py +50 -0
- neverlib/.history/vad/VAD_Silero_20250809232456.py +50 -0
- neverlib/.history/vad/VAD_WebRTC_20250805234211.py +61 -0
- neverlib/.history/vad/VAD_WebRTC_20250809232456.py +61 -0
- neverlib/.history/vad/VAD_funasr_20250805234211.py +54 -0
- neverlib/.history/vad/VAD_funasr_20250809232456.py +54 -0
- neverlib/.history/vad/VAD_vadlib_20250805234211.py +70 -0
- neverlib/.history/vad/VAD_vadlib_20250809232455.py +70 -0
- neverlib/.history/vad/VAD_whisper_20250805234211.py +55 -0
- neverlib/.history/vad/VAD_whisper_20250809232456.py +55 -0
- neverlib/.specstory/.what-is-this.md +69 -0
- neverlib/.specstory/history/2025-08-05_17-06Z-/350/277/231/344/270/200/346/255/245/347/232/204/347/233/256/347/232/204/346/230/257/344/273/200/344/271/210.md +424 -0
- neverlib/Docs/audio_aug/test_snr.py +55 -0
- neverlib/audio_aug/HarmonicDistortion.py +79 -0
- neverlib/audio_aug/TFDrop.py +41 -0
- neverlib/audio_aug/TFMask.py +56 -0
- neverlib/audio_aug/audio_aug.py +16 -1
- neverlib/audio_aug/clip_aug.py +41 -0
- neverlib/audio_aug/coder_aug.py +209 -0
- neverlib/audio_aug/coder_aug2.py +118 -0
- neverlib/audio_aug/loss_packet_aug.py +103 -0
- neverlib/audio_aug/quant_aug.py +78 -0
- neverlib/data_analyze/__init__.py +14 -0
- neverlib/filter/auto_eq/__init__.py +36 -0
- neverlib/filter/auto_eq/de_eq.py +360 -0
- neverlib/filter/auto_eq/freq_eq.py +76 -0
- neverlib/filter/{AudoEQ/auto_eq_ga_advanced.py → auto_eq/ga_eq_advanced.py} +1 -1
- neverlib/filter/auto_eq/ga_eq_basic.py +385 -0
- neverlib/metrics/dnsmos.py +58 -101
- neverlib/metrics/lpc_lsp.py +118 -0
- neverlib/metrics/snr.py +11 -4
- neverlib/metrics/spec.py +136 -45
- neverlib/utils/utils.py +17 -14
- neverlib/vad/PreProcess.py +5 -2
- neverlib/vad/VAD_Silero.py +1 -1
- neverlib/vad/VAD_WebRTC.py +1 -1
- neverlib/vad/VAD_funasr.py +1 -1
- neverlib/vad/VAD_vadlib.py +1 -1
- neverlib/vad/VAD_whisper.py +1 -1
- {neverlib-0.2.3.dist-info → neverlib-0.2.4.dist-info}/METADATA +1 -1
- neverlib-0.2.4.dist-info/RECORD +229 -0
- neverlib-0.2.3.dist-info/RECORD +0 -53
- /neverlib/{dataAnalyze/__init__.py → .history/dataAnalyze/__init___20250805234204.py} +0 -0
- /neverlib/{filter/AudoEQ/auto_eq_spectral_direct.py → .history/filter/auto_eq/freq_eq_20250805234206.py} +0 -0
- /neverlib/{dataAnalyze → data_analyze}/README.md +0 -0
- /neverlib/{dataAnalyze → data_analyze}/dataset_analyzer.py +0 -0
- /neverlib/{dataAnalyze → data_analyze}/quality_metrics.py +0 -0
- /neverlib/{dataAnalyze → data_analyze}/rms_distrubution.py +0 -0
- /neverlib/{dataAnalyze → data_analyze}/spectral_analysis.py +0 -0
- /neverlib/{dataAnalyze → data_analyze}/statistics.py +0 -0
- /neverlib/{dataAnalyze → data_analyze}/temporal_features.py +0 -0
- /neverlib/{dataAnalyze → data_analyze}/visualization.py +0 -0
- /neverlib/filter/{AudoEQ → auto_eq}/README.md +0 -0
- {neverlib-0.2.3.dist-info → neverlib-0.2.4.dist-info}/WHEEL +0 -0
- {neverlib-0.2.3.dist-info → neverlib-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.3.dist-info → neverlib-0.2.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-16 13:51:57
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
|
|
7
|
+
import librosa
|
|
8
|
+
import numpy as np
|
|
9
|
+
import soundfile as sf
|
|
10
|
+
|
|
11
|
+
def lsd(reference, estimate, n_fft=2048, hop_length=512, win_length=None, epsilon=1e-10):
    """
    Compute the log-spectral distance (LSD) between two 1-D audio signals.

    Both signals are converted to STFT power spectra, expressed in dB, and the
    root-mean-square of the dB difference over every (frequency, frame) bin is
    returned.

    Args:
        reference (np.ndarray): clean reference signal (1-D array).
        estimate (np.ndarray): estimated / processed signal (1-D array).
        n_fft (int): FFT size passed to ``librosa.stft``.
        hop_length (int): hop size passed to ``librosa.stft``.
        win_length (int, optional): window length; ``n_fft`` is used when None.
        epsilon (float): small constant added to the power spectrum before the
            logarithm so that log10(0) is never evaluated.
            NOTE(review): this file referenced an ``EPS`` constant that it never
            imported; the default used here is an assumption, confirm the
            intended value.

    Returns:
        float: log-spectral distance in dB.
    """
    assert reference.ndim == 1 and estimate.ndim == 1, "输入信号必须是一维数组。"

    if win_length is None:
        win_length = n_fft

    def _log_power_spec(signal):
        # STFT -> power spectrum -> dB, shape (F, T)
        stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        return 10 * np.log10(np.abs(stft) ** 2 + epsilon)

    squared_error = (_log_power_spec(reference) - _log_power_spec(estimate)) ** 2
    lsd_val = np.sqrt(np.mean(squared_error))

    return lsd_val
|
|
45
|
+
|
|
46
|
+
def mcd(ref_wav, test_wav, sr=16000):
    """
    Mel-cepstral distance (MCD) between two audio files.

    Args:
        ref_wav: reference audio, in any form accepted by ``soundfile.read``
            (e.g. a file path).
        test_wav: audio under test, same form as ``ref_wav``.
        sr (int): sample rate both files are required to have.

    Returns:
        float: (10 / ln 10) * sqrt(2 * mean over frames of the summed squared
        MFCC difference), with the 0th coefficient excluded.

    Raises:
        AssertionError: if either sample rate differs from ``sr`` or the two
            signals differ in length.
    """
    ref_wav, ref_sr = sf.read(ref_wav)
    test_wav, test_sr = sf.read(test_wav)
    # Report the rate that was actually requested instead of a fixed 16000.
    assert ref_sr == test_sr == sr, f"采样率必须为{sr}Hz"
    assert len(ref_wav) == len(test_wav), "音频长度必须相同"

    ref_mfcc = librosa.feature.mfcc(y=ref_wav, sr=sr)
    test_mfcc = librosa.feature.mfcc(y=test_wav, sr=sr)

    # Skip the 0th coefficient when computing the distance.
    diff = ref_mfcc[1:] - test_mfcc[1:]
    mcd_val = (10.0 / np.log(10)) * np.sqrt(2 * np.mean(np.sum(diff ** 2, axis=0)))
    return mcd_val
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-16 13:51:57
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
|
|
7
|
+
import librosa
|
|
8
|
+
import numpy as np
|
|
9
|
+
import soundfile as sf
|
|
10
|
+
from utils import EPS
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def lsd(ref_wav, test_wav, n_fft=2048, hop_length=512, win_length=None):
    """
    Log-spectral distance (LSD) between two 1-D audio signals.

    Each signal is turned into an STFT power spectrum in dB (with the
    module-level ``EPS`` added before the logarithm); the result is the
    root-mean-square of the dB difference over all bins.

    Args:
        ref_wav (np.ndarray): clean reference signal (1-D array).
        test_wav (np.ndarray): estimated / processed signal (1-D array).
        n_fft (int): FFT size.
        hop_length (int): hop size.
        win_length (int, optional): window length; ``n_fft`` when None.

    Returns:
        float: log-spectral distance in dB.
    """
    assert ref_wav.ndim == 1 and test_wav.ndim == 1, "输入信号必须是一维数组。"

    frame_len = n_fft if win_length is None else win_length
    stft_options = dict(n_fft=n_fft, hop_length=hop_length, win_length=frame_len)

    db_specs = []
    for signal in (ref_wav, test_wav):
        power = np.abs(librosa.stft(signal, **stft_options)) ** 2  # (F,T)
        db_specs.append(10 * np.log10(power + EPS))

    ref_db, test_db = db_specs
    return np.sqrt(np.mean((ref_db - test_db) ** 2))
|
|
47
|
+
|
|
48
|
+
def mcd(ref_wav, test_wav, sr=16000):
    """
    Mel-cepstral distance (MCD) between two audio files.

    Args:
        ref_wav: reference audio, in any form accepted by ``soundfile.read``
            (e.g. a file path).
        test_wav: audio under test, same form as ``ref_wav``.
        sr (int): sample rate both files are required to have.

    Returns:
        float: (10 / ln 10) * sqrt(2 * mean over frames of the summed squared
        MFCC difference), with the 0th coefficient excluded.

    Raises:
        AssertionError: if either sample rate differs from ``sr`` or the two
            signals differ in length.
    """
    ref_wav, ref_sr = sf.read(ref_wav)
    test_wav, test_sr = sf.read(test_wav)
    # Report the rate that was actually requested instead of a fixed 16000.
    assert ref_sr == test_sr == sr, f"采样率必须为{sr}Hz"
    assert len(ref_wav) == len(test_wav), "音频长度必须相同"

    ref_mfcc = librosa.feature.mfcc(y=ref_wav, sr=sr)
    test_mfcc = librosa.feature.mfcc(y=test_wav, sr=sr)

    # Skip the 0th coefficient when computing the distance.
    diff = ref_mfcc[1:] - test_mfcc[1:]
    mcd_val = (10.0 / np.log(10)) * np.sqrt(2 * np.mean(np.sum(diff ** 2, axis=0)))
    return mcd_val
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-02-13 20:06:07
|
|
4
|
+
LastEditTime: 2025-03-17 16:06:11
|
|
5
|
+
FilePath: \neverlib\vad\PreProcess.py
|
|
6
|
+
Description:
|
|
7
|
+
'''
|
|
8
|
+
# -*- coding:utf-8 -*-
|
|
9
|
+
# Author:凌逆战 | Never
|
|
10
|
+
# Date: 2024/9/14
|
|
11
|
+
"""
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
import noisereduce as nr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pre_emphasis(audio_data, alpha=0.97):
    """Apply first-order pre-emphasis: y(n) = x(n) - alpha * x(n-1).

    :param audio_data: array-like of samples; the first sample is kept unchanged
    :param alpha: pre-emphasis coefficient
    :return: numpy array of filtered samples (an empty array for empty input)
    """
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # Nothing to filter; audio_data[0] below would raise IndexError.
        return audio_data.copy()
    emphasized_audio = np.append(audio_data[0], audio_data[1:] - alpha * audio_data[:-1])
    return emphasized_audio
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
    """ Classical noise reduction via noisereduce. Doc: https://pypi.org/project/noisereduce/
    :param wav: (xxx,) or (channels, xxx)
    :param sr: sample rate
    :param stationary: truthy -> stationary reduction, falsy -> non-stationary
    :param prop_decrease: 0~1, proportion of the noise to remove
    :return: the output of ``nr.reduce_noise``
    """
    # The two modes differ only in the flag handed to noisereduce.
    use_stationary = True if stationary else False
    return nr.reduce_noise(y=wav, sr=sr, stationary=use_stationary,
                           prop_decrease=prop_decrease)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def NS_test():
    """Demo: denoise the example recording, save the result and plot both spectrograms."""
    import soundfile as sf
    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # Everything below uses `sr`; refuse files recorded at another rate instead of
    # processing and saving them with the wrong sample rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
    sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)

    # Plot the spectrogram before (top) and after (bottom) noise reduction.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(212)
    plt.specgram(wav_NS, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Run the noise-reduction demo only when this file is executed as a script.
if __name__ == "__main__":
    NS_test()
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-02-13 20:06:07
|
|
4
|
+
LastEditTime: 2025-03-17 16:06:11
|
|
5
|
+
FilePath: \neverlib\vad\PreProcess.py
|
|
6
|
+
Description:
|
|
7
|
+
'''
|
|
8
|
+
# -*- coding:utf-8 -*-
|
|
9
|
+
# Author:凌逆战 | Never
|
|
10
|
+
# Date: 2024/9/14
|
|
11
|
+
"""
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
import noisereduce as nr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pre_emphasis(audio_data, alpha=0.97):
    """Apply first-order pre-emphasis: y(n) = x(n) - alpha * x(n-1).

    :param audio_data: array-like of samples; the first sample is kept unchanged
    :param alpha: pre-emphasis coefficient
    :return: numpy array of filtered samples (an empty array for empty input)
    """
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # Nothing to filter; audio_data[0] below would raise IndexError.
        return audio_data.copy()
    emphasized_audio = np.append(audio_data[0], audio_data[1:] - alpha * audio_data[:-1])
    return emphasized_audio
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
    """ Classical noise reduction via noisereduce. Doc: https://pypi.org/project/noisereduce/
    :param wav: (xxx,) or (channels, xxx)
    :param sr: sample rate
    :param stationary: truthy -> stationary reduction, falsy -> non-stationary
    :param prop_decrease: 0~1, proportion of the noise to remove
    :return: the output of ``nr.reduce_noise``
    """
    # Stationary and non-stationary reduction share every other argument.
    mode_flag = not (not stationary)
    denoised = nr.reduce_noise(y=wav, sr=sr, stationary=mode_flag,
                               prop_decrease=prop_decrease)
    return denoised
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def NS_test():
    """Demo: denoise the example recording, save the result and plot both spectrograms."""
    import soundfile as sf
    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # Everything below uses `sr`; refuse files recorded at another rate instead of
    # processing and saving them with the wrong sample rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
    sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)

    # Plot the spectrogram before (top) and after (bottom) noise reduction.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(212)
    plt.specgram(wav_NS, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Run the noise-reduction demo only when this file is executed as a script.
if __name__ == "__main__":
    NS_test()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-02-13 20:06:07
|
|
4
|
+
LastEditTime: 2025-08-16 02:07:24
|
|
5
|
+
FilePath: \neverlib\vad\PreProcess.py
|
|
6
|
+
Description:
|
|
7
|
+
'''
|
|
8
|
+
# -*- coding:utf-8 -*-
|
|
9
|
+
# Author:凌逆战 | Never
|
|
10
|
+
# Date: 2024/9/14
|
|
11
|
+
"""
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
import noisereduce as nr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pre_emphasis(audio_data, alpha=0.97):
    """
    Pre-emphasis: y(n) = x(n) - alpha * x(n-1).

    :param audio_data: array-like of samples; the first sample is kept unchanged
    :param alpha: pre-emphasis coefficient
    :return: numpy array of filtered samples (an empty array for empty input)
    """
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # Nothing to filter; audio_data[0] below would raise IndexError.
        return audio_data.copy()
    emphasized_audio = np.append(audio_data[0], audio_data[1:] - alpha * audio_data[:-1])
    return emphasized_audio
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
    """ Classical noise reduction via noisereduce. Doc: https://pypi.org/project/noisereduce/
    :param wav: (xxx,) or (channels, xxx)
    :param sr: sample rate
    :param stationary: truthy -> stationary reduction, falsy -> non-stationary
    :param prop_decrease: 0~1, proportion of the noise to remove
    :return: the output of ``nr.reduce_noise``
    """
    # Only the `stationary` flag changes between the two reduction modes.
    options = {
        "y": wav,
        "sr": sr,
        "stationary": bool(stationary),
        "prop_decrease": prop_decrease,
    }
    return nr.reduce_noise(**options)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def NS_test():
    """Demo: denoise the example recording, save the result and plot both spectrograms."""
    import soundfile as sf
    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # Everything below uses `sr`; refuse files recorded at another rate instead of
    # processing and saving them with the wrong sample rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
    sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)

    # Plot the spectrogram before (top) and after (bottom) noise reduction.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(212)
    plt.specgram(wav_NS, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Run the noise-reduction demo only when this file is executed as a script.
if __name__ == "__main__":
    NS_test()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Silero_VAD_C():
    """Voice activity detection with the Silero VAD model loaded through ``torch.hub``."""

    def __init__(self, sr=16000, threshold=0.5, min_speech_duration_ms=10,
                 min_silence_duration_ms=140, window_size_samples=512, speech_pad_ms=0):
        """Store the detection settings and load the model plus its helper functions."""
        self.sr = sr
        self.threshold = threshold
        # Minimum duration of a speech chunk, in ms.
        self.min_speech_duration_ms = min_speech_duration_ms
        # Minimum silence between speech chunks, in ms.
        self.min_silence_duration_ms = min_silence_duration_ms
        # 512\1024\1536
        self.window_size_samples = window_size_samples
        # Final speech chunks are padded by speech_pad_ms on each side.
        self.speech_pad_ms = speech_pad_ms

        hub_result = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad',
                                    force_reload=False, onnx=True)
        self.model, helpers = hub_result
        # Only the first of the five helpers is kept on the instance.
        (self.get_speech_timestamps, _save_audio, _read_audio,
         _vad_iterator, _collect_chunks) = helpers

    def process(self, wav):
        """Return what ``get_speech_timestamps`` yields for the 1-D signal ``wav``."""
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        detection_options = {
            "sampling_rate": self.sr,
            "threshold": self.threshold,
            "min_speech_duration_ms": self.min_speech_duration_ms,
            "min_silence_duration_ms": self.min_silence_duration_ms,
            "window_size_samples": self.window_size_samples,
            "speech_pad_ms": self.speech_pad_ms,
        }
        return self.get_speech_timestamps(wav, self.model, **detection_options)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Demo: run Silero VAD on the example recording and print the detected segments.
if __name__ == "__main__":
    import soundfile as sf
    # NOTE(review): the PreProcess snapshots shown alongside this file define only
    # pre_emphasis / NS / NS_test - confirm HPFilter and volume_norm are exported there.
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # The detector is configured for `sr`; reject files with a different rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    # Pre-processing before detection: high-pass filter, then volume normalisation.
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)

    vad = Silero_VAD_C()
    vad_array = vad.process(wav)
    print(vad_array)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Silero_VAD_C():
    """Voice activity detection with the Silero VAD model loaded through ``torch.hub``."""

    def __init__(self, sr=16000, threshold=0.5, min_speech_duration_ms=10,
                 min_silence_duration_ms=140, window_size_samples=512, speech_pad_ms=0):
        """Store the detection settings and load the model plus its helper functions."""
        self.sr = sr
        self.threshold = threshold
        # Minimum duration of a speech chunk, in ms.
        self.min_speech_duration_ms = min_speech_duration_ms
        # Minimum silence between speech chunks, in ms.
        self.min_silence_duration_ms = min_silence_duration_ms
        # 512\1024\1536
        self.window_size_samples = window_size_samples
        # Final speech chunks are padded by speech_pad_ms on each side.
        self.speech_pad_ms = speech_pad_ms

        hub_result = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad',
                                    force_reload=False, onnx=True)
        self.model, helpers = hub_result
        # Only the first of the five helpers is kept on the instance.
        (self.get_speech_timestamps, _save_audio, _read_audio,
         _vad_iterator, _collect_chunks) = helpers

    def process(self, wav):
        """Return what ``get_speech_timestamps`` yields for the 1-D signal ``wav``."""
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        detection_options = {
            "sampling_rate": self.sr,
            "threshold": self.threshold,
            "min_speech_duration_ms": self.min_speech_duration_ms,
            "min_silence_duration_ms": self.min_silence_duration_ms,
            "window_size_samples": self.window_size_samples,
            "speech_pad_ms": self.speech_pad_ms,
        }
        return self.get_speech_timestamps(wav, self.model, **detection_options)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Demo: run Silero VAD on the example recording and print the detected segments.
if __name__ == "__main__":
    import soundfile as sf
    # NOTE(review): the PreProcess snapshots shown alongside this file define only
    # pre_emphasis / NS / NS_test - confirm HPFilter and volume_norm are exported there.
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # The detector is configured for `sr`; reject files with a different rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    # Pre-processing before detection: high-pass filter, then volume normalisation.
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)

    vad = Silero_VAD_C()
    vad_array = vad.process(wav)
    print(vad_array)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WebRTC_VAD_C():
    """Frame-by-frame voice activity detection built on ``webrtcvad``."""

    def __init__(self, sr=16000, window_len=10, mode=1):
        """
        :param sr: sample rate in Hz
        :param window_len: frame length (ms)
        :param mode: value passed to ``Vad.set_mode`` (0~3)
        """
        import webrtcvad
        self.sr = sr
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(mode)  # 0~3
        self.window_len = int(window_len / 1000 * sr)  # frame length in samples

    def process(self, wav):
        """
        Label every sample of ``wav`` with the speech decision of its frame.

        :param wav: 1-D float signal, expected to lie in [-1, 1]
        :return: int16 array holding the per-frame ``is_speech`` result repeated
                 for each sample; trailing samples that do not fill a whole
                 frame are dropped, so it can be shorter than ``wav``
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        # float -> int16. Clip first: samples outside [-1, 1] would otherwise
        # overflow the int16 range during the cast.
        int16_max = np.iinfo(np.int16).max
        wav_int16 = (np.clip(wav, -1.0, 1.0) * int16_max).astype(np.int16)
        # Keep only whole frames.
        wav_int16 = wav_int16[:len(wav_int16) - len(wav_int16) % self.window_len]
        vad_array = np.zeros_like(wav_int16)
        for i in range(0, len(wav_int16), self.window_len):
            frame = wav_int16[i:i + self.window_len]
            vad_array[i:i + self.window_len] = self.vad.is_speech(frame.tobytes(), self.sr)

        return vad_array
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Demo: run WebRTC VAD on the example recording and plot the result.
if __name__ == "__main__":
    import soundfile as sf
    import matplotlib.pyplot as plt
    # NOTE(review): the PreProcess snapshots shown alongside this file define only
    # pre_emphasis / NS / NS_test - confirm HPFilter and volume_norm are exported there.
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # The detector is configured for `sr`; reject files with a different rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    # Pre-processing before detection: high-pass filter, then volume normalisation.
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)

    vad = WebRTC_VAD_C()
    vad_array = vad.process(wav)

    # Waveform with the per-sample VAD decision drawn on top.
    plt.figure(figsize=(20, 5))
    plt.plot(wav)
    plt.plot(vad_array)
    plt.grid()
    plt.show()

    # Spectrogram of the signal (top) and of the VAD track (bottom).
    plt.figure(figsize=(20, 5))
    plt.subplot(2, 1, 1)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(2, 1, 2)
    plt.specgram(vad_array, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WebRTC_VAD_C():
    """Frame-by-frame voice activity detection built on ``webrtcvad``."""

    def __init__(self, sr=16000, window_len=10, mode=1):
        """
        :param sr: sample rate in Hz
        :param window_len: frame length (ms)
        :param mode: value passed to ``Vad.set_mode`` (0~3)
        """
        import webrtcvad
        self.sr = sr
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(mode)  # 0~3
        self.window_len = int(window_len / 1000 * sr)  # frame length in samples

    def process(self, wav):
        """
        Label every sample of ``wav`` with the speech decision of its frame.

        :param wav: 1-D float signal, expected to lie in [-1, 1]
        :return: int16 array holding the per-frame ``is_speech`` result repeated
                 for each sample; trailing samples that do not fill a whole
                 frame are dropped, so it can be shorter than ``wav``
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        # float -> int16. Clip first: samples outside [-1, 1] would otherwise
        # overflow the int16 range during the cast.
        int16_max = np.iinfo(np.int16).max
        wav_int16 = (np.clip(wav, -1.0, 1.0) * int16_max).astype(np.int16)
        # Keep only whole frames.
        wav_int16 = wav_int16[:len(wav_int16) - len(wav_int16) % self.window_len]
        vad_array = np.zeros_like(wav_int16)
        for i in range(0, len(wav_int16), self.window_len):
            frame = wav_int16[i:i + self.window_len]
            vad_array[i:i + self.window_len] = self.vad.is_speech(frame.tobytes(), self.sr)

        return vad_array
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == "__main__":
    # Demo: run the WebRTC detector on an example recording and plot the result.
    import soundfile as sf
    import matplotlib.pyplot as plt
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    # Pre-process with the project's HPFilter and volume_norm helpers.
    wav = volume_norm(HPFilter(wav, sr=sr, order=6, cutoff=100))

    vad_array = WebRTC_VAD_C().process(wav)

    # Figure 1: waveform with the per-sample decision drawn on top of it.
    plt.figure(figsize=(20, 5))
    for trace in (wav, vad_array):
        plt.plot(trace)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the audio (top) and of the decision track (bottom).
    plt.figure(figsize=(20, 5))
    for panel, signal in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, panel)
        plt.specgram(signal, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FunASR_VAD_C():
    """Voice activity detector that wraps the FunASR ``fsmn-vad`` model."""

    def __init__(self, sr=16000):
        """
        :param sr: sample rate in Hz, used to turn segment times into sample indices
        """
        # Imported here so the module can be loaded without funasr installed.
        from funasr import AutoModel
        self.sr = sr
        self.model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

    def process(self, wav):
        """Return a per-sample decision track for a 1-D waveform.

        :param wav: 1-D waveform array, passed to the model unchanged
        :return: array shaped and typed like ``wav``, set to 1 inside every
            segment reported by the model and 0 elsewhere
        :raises AssertionError: if ``wav`` is not one-dimensional
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        results = self.model.generate(input=wav)
        vad_array = np.zeros_like(wav)

        def to_sample(ms):
            # Segment bounds are treated as milliseconds and scaled to sample indices.
            return int(ms * self.sr / 1000)

        # Every result carries a list of (begin, end) pairs under "value".
        segments = (pair for res in results for pair in res["value"])
        for beg, end in segments:
            vad_array[to_sample(beg):to_sample(end)] = 1

        return vad_array
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
    # Demo: run the FunASR detector on an example recording and plot the result.
    import soundfile as sf
    import matplotlib.pyplot as plt
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    # Pre-process with the project's HPFilter and volume_norm helpers.
    wav = volume_norm(HPFilter(wav, sr=sr, order=6, cutoff=100))

    vad_array = FunASR_VAD_C().process(wav)

    # Figure 1: waveform with the per-sample decision drawn on top of it.
    plt.figure(figsize=(20, 5))
    for trace in (wav, vad_array):
        plt.plot(trace)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the audio (top) and of the decision track (bottom).
    plt.figure(figsize=(20, 5))
    for panel, signal in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, panel)
        plt.specgram(signal, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FunASR_VAD_C():
    """Voice activity detector that wraps the FunASR ``fsmn-vad`` model."""

    def __init__(self, sr=16000):
        """
        :param sr: sample rate in Hz, used to turn segment times into sample indices
        """
        # Imported here so the module can be loaded without funasr installed.
        from funasr import AutoModel
        self.sr = sr
        self.model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

    def process(self, wav):
        """Return a per-sample decision track for a 1-D waveform.

        :param wav: 1-D waveform array, passed to the model unchanged
        :return: array shaped and typed like ``wav``, set to 1 inside every
            segment reported by the model and 0 elsewhere
        :raises AssertionError: if ``wav`` is not one-dimensional
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        results = self.model.generate(input=wav)
        vad_array = np.zeros_like(wav)

        def to_sample(ms):
            # Segment bounds are treated as milliseconds and scaled to sample indices.
            return int(ms * self.sr / 1000)

        # Every result carries a list of (begin, end) pairs under "value".
        segments = (pair for res in results for pair in res["value"])
        for beg, end in segments:
            vad_array[to_sample(beg):to_sample(end)] = 1

        return vad_array
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
    # Demo: run the FunASR detector on an example recording and plot the result.
    import soundfile as sf
    import matplotlib.pyplot as plt
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    # Pre-process with the project's HPFilter and volume_norm helpers.
    wav = volume_norm(HPFilter(wav, sr=sr, order=6, cutoff=100))

    vad_array = FunASR_VAD_C().process(wav)

    # Figure 1: waveform with the per-sample decision drawn on top of it.
    plt.figure(figsize=(20, 5))
    for trace in (wav, vad_array):
        plt.plot(trace)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the audio (top) and of the decision track (bottom).
    plt.figure(figsize=(20, 5))
    for panel, signal in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, panel)
        plt.specgram(signal, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|