neverlib 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.history/Docs/audio_aug/test_snr_20250806011311.py +0 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011331.py +75 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011342.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011352.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011403.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011413.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011435.py +55 -0
- neverlib/.history/Docs/vad/1_20250810032405.py +0 -0
- neverlib/.history/Docs/vad/1_20250810032417.py +39 -0
- neverlib/.history/audio_aug/audio_aug_20250806010451.py +125 -0
- neverlib/.history/audio_aug/audio_aug_20250806010750.py +138 -0
- neverlib/.history/audio_aug/audio_aug_20250806010759.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010803.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010809.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806011108.py +140 -0
- neverlib/.history/dataAnalyze/__init___20250805234204.py +87 -0
- neverlib/.history/dataAnalyze/__init___20250806204125.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204139.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204159.py +14 -0
- neverlib/.history/filter/__init___20250820103351.py +70 -0
- neverlib/.history/filter/__init___20250821102348.py +70 -0
- neverlib/.history/filter/__init___20250821102405.py +14 -0
- neverlib/.history/filter/auto_eq/__init___20250819213121.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102241.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102259.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102307.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102310.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102318.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102507.py +36 -0
- neverlib/.history/filter/auto_eq/de_eq_20250820103848.py +361 -0
- neverlib/.history/filter/auto_eq/de_eq_20250821102422.py +360 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250805234206.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140732.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140745.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140816.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140938.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141003.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141006.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141019.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141049.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141211.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141227.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141311.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141340.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141712.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141733.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141755.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102434.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102500.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102502.py +76 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820102957.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113054.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113150.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113520.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113525.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250821102212.py +385 -0
- neverlib/.history/metrics/dnsmos_20250806001612.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180659.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180701.py +158 -0
- neverlib/.history/metrics/dnsmos_20250815181321.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181327.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181331.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181620.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181631.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181742.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181824.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181834.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181922.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815182011.py +147 -0
- neverlib/.history/metrics/dnsmos_20250815182036.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815182936.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815182942.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183032.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183101.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183121.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183123.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183214.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183240.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183248.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183407.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183409.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183431.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183507.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183513.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183618.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183709.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183756.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183815.py +128 -0
- neverlib/.history/metrics/dnsmos_20250815183827.py +129 -0
- neverlib/.history/metrics/dnsmos_20250815183913.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815183914.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184003.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184040.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184049.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184104.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184200.py +117 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816015944.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020142.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020156.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020554.py +130 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020600.py +125 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020631.py +120 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020746.py +118 -0
- neverlib/.history/metrics/lpc_me_20250816013111.py +0 -0
- neverlib/.history/metrics/lpc_me_20250816013129.py +121 -0
- neverlib/.history/metrics/lpc_me_20250816015430.py +103 -0
- neverlib/.history/metrics/lpc_me_20250816015535.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015542.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015636.py +97 -0
- neverlib/.history/metrics/lpc_me_20250816015658.py +104 -0
- neverlib/.history/metrics/lpc_me_20250816015703.py +100 -0
- neverlib/.history/metrics/lpc_me_20250816015945.py +128 -0
- neverlib/.history/metrics/snr_20250806010538.py +177 -0
- neverlib/.history/metrics/snr_20250806211634.py +184 -0
- neverlib/.history/metrics/spec_20250805234209.py +45 -0
- neverlib/.history/metrics/spec_20250816135530.py +11 -0
- neverlib/.history/metrics/spec_20250816135654.py +16 -0
- neverlib/.history/metrics/spec_20250816135736.py +68 -0
- neverlib/.history/metrics/spec_20250816135904.py +75 -0
- neverlib/.history/metrics/spec_20250816135921.py +82 -0
- neverlib/.history/metrics/spec_20250816140111.py +82 -0
- neverlib/.history/metrics/spec_20250816140543.py +136 -0
- neverlib/.history/metrics/spec_20250816140559.py +172 -0
- neverlib/.history/metrics/spec_20250816140602.py +172 -0
- neverlib/.history/metrics/spec_20250816140608.py +172 -0
- neverlib/.history/metrics/spec_20250816140654.py +148 -0
- neverlib/.history/metrics/spec_20250816140705.py +144 -0
- neverlib/.history/metrics/spec_20250816140755.py +138 -0
- neverlib/.history/metrics/spec_20250816140823.py +170 -0
- neverlib/.history/metrics/spec_20250816140832.py +170 -0
- neverlib/.history/metrics/spec_20250816140833.py +170 -0
- neverlib/.history/metrics/spec_20250816140922.py +147 -0
- neverlib/.history/metrics/spec_20250816141148.py +107 -0
- neverlib/.history/metrics/spec_20250816141219.py +123 -0
- neverlib/.history/metrics/spec_20250816141732.py +178 -0
- neverlib/.history/metrics/spec_20250816141740.py +178 -0
- neverlib/.history/metrics/spec_20250816142030.py +178 -0
- neverlib/.history/metrics/spec_20250816142107.py +135 -0
- neverlib/.history/metrics/spec_20250816142126.py +135 -0
- neverlib/.history/metrics/spec_20250816142410.py +135 -0
- neverlib/.history/metrics/spec_20250816142415.py +136 -0
- neverlib/.history/metrics/spec_metric_20250816135156.py +0 -0
- neverlib/.history/metrics/spec_metric_20250816135226.py +5 -0
- neverlib/.history/metrics/spec_metric_20250816135227.py +10 -0
- neverlib/.history/metrics/spec_metric_20250816135306.py +15 -0
- neverlib/.history/metrics/spec_metric_20250816135442.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135448.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135520.py +29 -0
- neverlib/.history/metrics/spec_metric_20250816135537.py +63 -0
- neverlib/.history/metrics/spec_metric_20250816135653.py +65 -0
- neverlib/.history/vad/PreProcess_20250805234211.py +63 -0
- neverlib/.history/vad/PreProcess_20250809232455.py +63 -0
- neverlib/.history/vad/PreProcess_20250816020725.py +66 -0
- neverlib/.history/vad/VAD_Silero_20250805234211.py +50 -0
- neverlib/.history/vad/VAD_Silero_20250809232456.py +50 -0
- neverlib/.history/vad/VAD_WebRTC_20250805234211.py +61 -0
- neverlib/.history/vad/VAD_WebRTC_20250809232456.py +61 -0
- neverlib/.history/vad/VAD_funasr_20250805234211.py +54 -0
- neverlib/.history/vad/VAD_funasr_20250809232456.py +54 -0
- neverlib/.history/vad/VAD_vadlib_20250805234211.py +70 -0
- neverlib/.history/vad/VAD_vadlib_20250809232455.py +70 -0
- neverlib/.history/vad/VAD_whisper_20250805234211.py +55 -0
- neverlib/.history/vad/VAD_whisper_20250809232456.py +55 -0
- neverlib/.specstory/.what-is-this.md +69 -0
- neverlib/.specstory/history/2025-08-05_17-06Z-/350/277/231/344/270/200/346/255/245/347/232/204/347/233/256/347/232/204/346/230/257/344/273/200/344/271/210.md +424 -0
- neverlib/Docs/audio_aug/test_snr.py +55 -0
- neverlib/__init__.py +2 -2
- neverlib/audio_aug/HarmonicDistortion.py +79 -0
- neverlib/audio_aug/TFDrop.py +41 -0
- neverlib/audio_aug/TFMask.py +56 -0
- neverlib/audio_aug/__init__.py +1 -1
- neverlib/audio_aug/audio_aug.py +19 -5
- neverlib/audio_aug/clip_aug.py +41 -0
- neverlib/audio_aug/coder_aug.py +209 -0
- neverlib/audio_aug/coder_aug2.py +118 -0
- neverlib/audio_aug/loss_packet_aug.py +103 -0
- neverlib/audio_aug/quant_aug.py +78 -0
- neverlib/data_analyze/README.md +234 -0
- neverlib/data_analyze/__init__.py +14 -0
- neverlib/data_analyze/dataset_analyzer.py +590 -0
- neverlib/data_analyze/quality_metrics.py +364 -0
- neverlib/data_analyze/rms_distrubution.py +62 -0
- neverlib/data_analyze/spectral_analysis.py +218 -0
- neverlib/data_analyze/statistics.py +406 -0
- neverlib/data_analyze/temporal_features.py +126 -0
- neverlib/data_analyze/visualization.py +468 -0
- neverlib/filter/README.md +101 -0
- neverlib/filter/__init__.py +7 -0
- neverlib/filter/auto_eq/README.md +165 -0
- neverlib/filter/auto_eq/__init__.py +36 -0
- neverlib/filter/auto_eq/de_eq.py +360 -0
- neverlib/filter/auto_eq/freq_eq.py +76 -0
- neverlib/filter/auto_eq/ga_eq_advanced.py +577 -0
- neverlib/filter/auto_eq/ga_eq_basic.py +385 -0
- neverlib/filter/biquad.py +45 -0
- neverlib/filter/common.py +5 -6
- neverlib/filter/core.py +339 -0
- neverlib/metrics/dnsmos.py +117 -0
- neverlib/metrics/lpc_lsp.py +118 -0
- neverlib/metrics/snr.py +184 -0
- neverlib/metrics/spec.py +136 -0
- neverlib/metrics/test_pesq.py +35 -0
- neverlib/metrics/time.py +68 -0
- neverlib/tests/test_vad.py +21 -0
- neverlib/utils/audio_split.py +2 -1
- neverlib/utils/message.py +4 -4
- neverlib/utils/utils.py +36 -16
- neverlib/vad/PreProcess.py +6 -3
- neverlib/vad/README.md +10 -10
- neverlib/vad/VAD_Energy.py +1 -1
- neverlib/vad/VAD_Silero.py +2 -2
- neverlib/vad/VAD_WebRTC.py +2 -2
- neverlib/vad/VAD_funasr.py +2 -2
- neverlib/vad/VAD_statistics.py +3 -3
- neverlib/vad/VAD_vadlib.py +3 -3
- neverlib/vad/VAD_whisper.py +2 -2
- neverlib/vad/__init__.py +1 -1
- neverlib/vad/class_get_speech.py +4 -4
- neverlib/vad/class_vad.py +1 -1
- neverlib/vad/utils.py +47 -5
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/METADATA +120 -120
- neverlib-0.2.4.dist-info/RECORD +229 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/WHEEL +1 -1
- neverlib/Documents/vad/VAD_Energy.ipynb +0 -159
- neverlib/Documents/vad/VAD_Silero.ipynb +0 -305
- neverlib/Documents/vad/VAD_WebRTC.ipynb +0 -183
- neverlib/Documents/vad/VAD_funasr.ipynb +0 -179
- neverlib/Documents/vad/VAD_ppasr.ipynb +0 -175
- neverlib/Documents/vad/VAD_statistics.ipynb +0 -522
- neverlib/Documents/vad/VAD_vadlib.ipynb +0 -184
- neverlib/Documents/vad/VAD_whisper.ipynb +0 -430
- neverlib/utils/waveform_analyzer.py +0 -51
- neverlib/wav_data/000_short.wav +0 -0
- neverlib-0.2.2.dist-info/RECORD +0 -40
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-16 13:51:57
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
|
|
7
|
+
import librosa
|
|
8
|
+
import numpy as np
|
|
9
|
+
import soundfile as sf
|
|
10
|
+
|
|
11
|
+
def lsd(reference, estimate, n_fft=2048, hop_length=512, win_length=None, epsilon=1e-10):
    """Compute the log-spectral distance (LSD) between two 1-D audio signals.

    The value is the root-mean-square difference between the two log-power
    spectrograms, taken over every time-frequency bin at once.

    Args:
        reference (np.ndarray): Clean reference signal (1-D array).
        estimate (np.ndarray): Estimated or processed signal (1-D array).
        n_fft (int): FFT size; sets the frequency resolution.
        hop_length (int): Hop between frames in samples; sets the time resolution.
        win_length (int, optional): Window length. Defaults to ``n_fft`` when None.
        epsilon (float): Small constant added to the power spectra before the
            logarithm so that silent bins do not produce ``log10(0)``.

    Returns:
        float: Log-spectral distance in dB.

    Raises:
        AssertionError: If either input is not one-dimensional.
    """
    assert reference.ndim == 1 and estimate.ndim == 1, "输入信号必须是一维数组。"

    if win_length is None:
        win_length = n_fft

    stft_kwargs = dict(n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    reference_stft = librosa.stft(reference, **stft_kwargs)  # (F, T)
    estimate_stft = librosa.stft(estimate, **stft_kwargs)  # (F, T)

    # Power spectra converted to dB. This module never defines the EPS constant
    # the earlier code referenced, so the floor is the explicit ``epsilon``.
    reference_log_power_spec = 10 * np.log10(np.abs(reference_stft) ** 2 + epsilon)
    estimate_log_power_spec = 10 * np.log10(np.abs(estimate_stft) ** 2 + epsilon)

    # The subtraction needs both spectrograms to have matching shapes, so the
    # two signals are expected to have the same length.
    squared_error = (reference_log_power_spec - estimate_log_power_spec) ** 2
    return np.sqrt(np.mean(squared_error))
|
|
45
|
+
|
|
46
|
+
def mcd(ref_wav, test_wav, sr=16000):
    """Compute a mel-cepstral distance (MCD) between two audio files.

    Both arguments are passed to ``soundfile.read``, so they are audio files
    (paths or anything else that function accepts), not spectra.

    Args:
        ref_wav: Reference audio file.
        test_wav: Test audio file.
        sr (int): Sample rate that both files must have.

    Returns:
        float: Distance computed from the librosa MFCCs of both signals with
        the 0th coefficient excluded.

    Raises:
        AssertionError: If either file's sample rate differs from ``sr`` or
            the two signals have different lengths.
    """
    ref_audio, ref_sr = sf.read(ref_wav)
    test_audio, test_sr = sf.read(test_wav)
    # Report the rate that was actually requested instead of a fixed 16000.
    assert ref_sr == test_sr == sr, f"采样率必须为{sr}Hz"
    assert len(ref_audio) == len(test_audio), "音频长度必须相同"

    ref_mfcc = librosa.feature.mfcc(y=ref_audio, sr=sr)
    test_mfcc = librosa.feature.mfcc(y=test_audio, sr=sr)

    # Drop the 0th coefficient before comparing.
    diff = ref_mfcc[1:] - test_mfcc[1:]
    # NOTE(review): the frame average is taken inside the square root here;
    # confirm this is the intended MCD variant (many definitions average the
    # per-frame square roots instead).
    return (10.0 / np.log(10)) * np.sqrt(2 * np.mean(np.sum(diff ** 2, axis=0)))
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-16 13:51:57
|
|
4
|
+
Description:
|
|
5
|
+
'''
|
|
6
|
+
|
|
7
|
+
import librosa
|
|
8
|
+
import numpy as np
|
|
9
|
+
import soundfile as sf
|
|
10
|
+
from utils import EPS
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def lsd(ref_wav, test_wav, n_fft=2048, hop_length=512, win_length=None, epsilon=None):
    """Compute the log-spectral distance (LSD) between two 1-D audio signals.

    The value is the root-mean-square difference between the two log-power
    spectrograms, taken over every time-frequency bin at once.

    Args:
        ref_wav (np.ndarray): Clean reference signal (1-D array).
        test_wav (np.ndarray): Estimated or processed signal (1-D array).
        n_fft (int): FFT size; sets the frequency resolution.
        hop_length (int): Hop between frames in samples; sets the time resolution.
        win_length (int, optional): Window length. Defaults to ``n_fft`` when None.
        epsilon (float, optional): Small constant added to the power spectra
            before the logarithm so that silent bins do not produce
            ``log10(0)``. Defaults to the module-level ``EPS`` when None.

    Returns:
        float: Log-spectral distance in dB.

    Raises:
        AssertionError: If either input is not one-dimensional.
    """
    assert ref_wav.ndim == 1 and test_wav.ndim == 1, "输入信号必须是一维数组。"

    if win_length is None:
        win_length = n_fft
    floor = EPS if epsilon is None else epsilon

    stft_kwargs = dict(n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    ref_stft = librosa.stft(ref_wav, **stft_kwargs)  # (F, T)
    test_stft = librosa.stft(test_wav, **stft_kwargs)  # (F, T)

    # Power spectra converted to dB.
    ref_log_power_spec = 10 * np.log10(np.abs(ref_stft) ** 2 + floor)
    test_log_power_spec = 10 * np.log10(np.abs(test_stft) ** 2 + floor)

    # The subtraction needs both spectrograms to have matching shapes, so the
    # two signals are expected to have the same length.
    squared_error = (ref_log_power_spec - test_log_power_spec) ** 2
    return np.sqrt(np.mean(squared_error))
|
|
47
|
+
|
|
48
|
+
def mcd(ref_wav, test_wav, sr=16000):
    """Compute a mel-cepstral distance (MCD) between two audio files.

    Both arguments are passed to ``soundfile.read``, so they are audio files
    (paths or anything else that function accepts), not spectra.

    Args:
        ref_wav: Reference audio file.
        test_wav: Test audio file.
        sr (int): Sample rate that both files must have.

    Returns:
        float: Distance computed from the librosa MFCCs of both signals with
        the 0th coefficient excluded.

    Raises:
        AssertionError: If either file's sample rate differs from ``sr`` or
            the two signals have different lengths.
    """
    ref_audio, ref_sr = sf.read(ref_wav)
    test_audio, test_sr = sf.read(test_wav)
    # Report the rate that was actually requested instead of a fixed 16000.
    assert ref_sr == test_sr == sr, f"采样率必须为{sr}Hz"
    assert len(ref_audio) == len(test_audio), "音频长度必须相同"

    ref_mfcc = librosa.feature.mfcc(y=ref_audio, sr=sr)
    test_mfcc = librosa.feature.mfcc(y=test_audio, sr=sr)

    # Drop the 0th coefficient before comparing.
    diff = ref_mfcc[1:] - test_mfcc[1:]
    # NOTE(review): the frame average is taken inside the square root here;
    # confirm this is the intended MCD variant (many definitions average the
    # per-frame square roots instead).
    return (10.0 / np.log(10)) * np.sqrt(2 * np.mean(np.sum(diff ** 2, axis=0)))
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-02-13 20:06:07
|
|
4
|
+
LastEditTime: 2025-03-17 16:06:11
|
|
5
|
+
FilePath: \neverlib\vad\PreProcess.py
|
|
6
|
+
Description:
|
|
7
|
+
'''
|
|
8
|
+
# -*- coding:utf-8 -*-
|
|
9
|
+
# Author:凌逆战 | Never
|
|
10
|
+
# Date: 2024/9/14
|
|
11
|
+
"""
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
import noisereduce as nr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pre_emphasis(audio_data, alpha=0.97):
    """Apply first-order pre-emphasis: y(n) = x(n) - alpha * x(n-1).

    The first sample is passed through unchanged because it has no predecessor.

    :param audio_data: input samples (array or sequence)
    :param alpha: pre-emphasis coefficient
    :return: filtered samples as a flat numpy array; empty input gives an empty array
    """
    samples = np.asarray(audio_data)
    # x(n) - alpha * x(n-1) for every n >= 1
    filtered_tail = samples[1:] - alpha * samples[:-1]
    if samples.size == 0:
        # There is no first sample to prepend; indexing element 0 would fail.
        return filtered_tail.ravel()
    return np.append(samples[0], filtered_tail)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
    """Classical noise reduction via ``noisereduce.reduce_noise``.

    Doc: https://pypi.org/project/noisereduce/

    :param wav: audio shaped (xxx,) or (channels, xxx)
    :param sr: sample rate
    :param stationary: truthy for stationary noise reduction, falsy for non-stationary
    :param prop_decrease: 0~1, proportion of the noise to remove
    :return: the result of ``reduce_noise``
    """
    # Both modes take the same arguments and differ only in the flag.
    use_stationary = True if stationary else False
    return nr.reduce_noise(
        y=wav,
        sr=sr,
        stationary=use_stationary,
        prop_decrease=prop_decrease,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def NS_test():
    """Demo: denoise the example recording, save it, and plot both spectrograms."""
    import soundfile as sf

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # ``sr`` drives the denoiser, the written file and the plots, so the
    # recording must really be at that rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
    sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)

    # Plot the spectrogram before (top) and after (bottom) noise reduction.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(212)
    plt.specgram(wav_NS, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()


if __name__ == "__main__":
    NS_test()
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-02-13 20:06:07
|
|
4
|
+
LastEditTime: 2025-03-17 16:06:11
|
|
5
|
+
FilePath: \neverlib\vad\PreProcess.py
|
|
6
|
+
Description:
|
|
7
|
+
'''
|
|
8
|
+
# -*- coding:utf-8 -*-
|
|
9
|
+
# Author:凌逆战 | Never
|
|
10
|
+
# Date: 2024/9/14
|
|
11
|
+
"""
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
import noisereduce as nr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pre_emphasis(audio_data, alpha=0.97):
    """Apply first-order pre-emphasis: y(n) = x(n) - alpha * x(n-1).

    The first sample is passed through unchanged because it has no predecessor.

    :param audio_data: input samples (array or sequence)
    :param alpha: pre-emphasis coefficient
    :return: filtered samples as a flat numpy array; empty input gives an empty array
    """
    samples = np.asarray(audio_data)
    # x(n) - alpha * x(n-1) for every n >= 1
    filtered_tail = samples[1:] - alpha * samples[:-1]
    if samples.size == 0:
        # There is no first sample to prepend; indexing element 0 would fail.
        return filtered_tail.ravel()
    return np.append(samples[0], filtered_tail)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
    """Classical noise reduction via ``noisereduce.reduce_noise``.

    Doc: https://pypi.org/project/noisereduce/

    :param wav: audio shaped (xxx,) or (channels, xxx)
    :param sr: sample rate
    :param stationary: truthy for stationary noise reduction, falsy for non-stationary
    :param prop_decrease: 0~1, proportion of the noise to remove
    :return: the result of ``reduce_noise``
    """
    # Both modes take the same arguments and differ only in the flag.
    use_stationary = True if stationary else False
    return nr.reduce_noise(
        y=wav,
        sr=sr,
        stationary=use_stationary,
        prop_decrease=prop_decrease,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def NS_test():
    """Demo: denoise the example recording, save it, and plot both spectrograms."""
    import soundfile as sf

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # ``sr`` drives the denoiser, the written file and the plots, so the
    # recording must really be at that rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
    sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)

    # Plot the spectrogram before (top) and after (bottom) noise reduction.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(212)
    plt.specgram(wav_NS, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()


if __name__ == "__main__":
    NS_test()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-02-13 20:06:07
|
|
4
|
+
LastEditTime: 2025-08-16 02:07:24
|
|
5
|
+
FilePath: \neverlib\vad\PreProcess.py
|
|
6
|
+
Description:
|
|
7
|
+
'''
|
|
8
|
+
# -*- coding:utf-8 -*-
|
|
9
|
+
# Author:凌逆战 | Never
|
|
10
|
+
# Date: 2024/9/14
|
|
11
|
+
"""
|
|
12
|
+
通过一些预处理方法, 来提高VAD的准确率
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
import noisereduce as nr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pre_emphasis(audio_data, alpha=0.97):
    """Apply first-order pre-emphasis: y(n) = x(n) - alpha * x(n-1).

    The first sample is passed through unchanged because it has no predecessor.

    :param audio_data: input samples (array or sequence)
    :param alpha: pre-emphasis coefficient
    :return: filtered samples as a flat numpy array; empty input gives an empty array
    """
    samples = np.asarray(audio_data)
    # x(n) - alpha * x(n-1) for every n >= 1
    filtered_tail = samples[1:] - alpha * samples[:-1]
    if samples.size == 0:
        # There is no first sample to prepend; indexing element 0 would fail.
        return filtered_tail.ravel()
    return np.append(samples[0], filtered_tail)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def NS(wav, sr=16000, stationary=True, prop_decrease=1.):
    """Classical noise reduction via ``noisereduce.reduce_noise``.

    Doc: https://pypi.org/project/noisereduce/

    :param wav: audio shaped (xxx,) or (channels, xxx)
    :param sr: sample rate
    :param stationary: truthy for stationary noise reduction, falsy for non-stationary
    :param prop_decrease: 0~1, proportion of the noise to remove
    :return: the result of ``reduce_noise``
    """
    # Both modes take the same arguments and differ only in the flag.
    use_stationary = True if stationary else False
    return nr.reduce_noise(
        y=wav,
        sr=sr,
        stationary=use_stationary,
        prop_decrease=prop_decrease,
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def NS_test():
    """Demo: denoise the example recording, save it, and plot both spectrograms."""
    import soundfile as sf

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    # ``sr`` drives the denoiser, the written file and the plots, so the
    # recording must really be at that rate.
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"
    wav_NS = NS(wav, sr=sr, stationary=True, prop_decrease=0.6)
    sf.write("../../wav_data/000_short_NS.wav", wav_NS, samplerate=sr)

    # Plot the spectrogram before (top) and after (bottom) noise reduction.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.specgram(wav, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.subplot(212)
    plt.specgram(wav_NS, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()


if __name__ == "__main__":
    NS_test()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Silero_VAD_C():
    """Wrapper around the Silero VAD model loaded through ``torch.hub``."""

    def __init__(self, sr=16000, threshold=0.5, min_speech_duration_ms=10,
                 min_silence_duration_ms=140, window_size_samples=512, speech_pad_ms=0):
        """Store the detection settings and load the model.

        :param sr: sample rate, forwarded as ``sampling_rate``
        :param threshold: detection threshold forwarded to the detector
        :param min_speech_duration_ms: minimum duration of a speech chunk (ms)
        :param min_silence_duration_ms: minimum silence between speech chunks (ms)
        :param window_size_samples: analysis window in samples (512/1024/1536)
        :param speech_pad_ms: padding added on both sides of each final speech chunk (ms)
        """
        self.sr = sr
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.window_size_samples = window_size_samples
        self.speech_pad_ms = speech_pad_ms

        self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad',
                                           force_reload=False, onnx=True)
        # Only the first helper in the returned tuple is used by this class.
        self.get_speech_timestamps, *_ = utils

    def process(self, wav):
        """Run the detector on a 1-D signal.

        :param wav: one-dimensional audio array
        :return: whatever ``get_speech_timestamps`` returns for this signal
        :raises AssertionError: if ``wav`` is not one-dimensional
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        detector_options = dict(
            sampling_rate=self.sr,
            threshold=self.threshold,
            min_speech_duration_ms=self.min_speech_duration_ms,
            min_silence_duration_ms=self.min_silence_duration_ms,
            window_size_samples=self.window_size_samples,
            speech_pad_ms=self.speech_pad_ms,
        )
        return self.get_speech_timestamps(wav, self.model, **detector_options)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
if __name__ == "__main__":
    # Demo: run Silero VAD on the example recording and print the result.
    import soundfile as sf
    # NOTE(review): verify that neverlib.vad.PreProcess really exports
    # HPFilter and volume_norm before relying on this demo.
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"

    # High-pass filter first, then normalise the volume.
    filtered = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(filtered)

    print(Silero_VAD_C().process(wav))
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Silero_VAD_C():
    """Wrapper around the Silero VAD model loaded through ``torch.hub``."""

    def __init__(self, sr=16000, threshold=0.5, min_speech_duration_ms=10,
                 min_silence_duration_ms=140, window_size_samples=512, speech_pad_ms=0):
        """Store the detection settings and load the model.

        :param sr: sample rate, forwarded as ``sampling_rate``
        :param threshold: detection threshold forwarded to the detector
        :param min_speech_duration_ms: minimum duration of a speech chunk (ms)
        :param min_silence_duration_ms: minimum silence between speech chunks (ms)
        :param window_size_samples: analysis window in samples (512/1024/1536)
        :param speech_pad_ms: padding added on both sides of each final speech chunk (ms)
        """
        self.sr = sr
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.window_size_samples = window_size_samples
        self.speech_pad_ms = speech_pad_ms

        self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad',
                                           force_reload=False, onnx=True)
        # Only the first helper in the returned tuple is used by this class.
        self.get_speech_timestamps, *_ = utils

    def process(self, wav):
        """Run the detector on a 1-D signal.

        :param wav: one-dimensional audio array
        :return: whatever ``get_speech_timestamps`` returns for this signal
        :raises AssertionError: if ``wav`` is not one-dimensional
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        detector_options = dict(
            sampling_rate=self.sr,
            threshold=self.threshold,
            min_speech_duration_ms=self.min_speech_duration_ms,
            min_silence_duration_ms=self.min_silence_duration_ms,
            window_size_samples=self.window_size_samples,
            speech_pad_ms=self.speech_pad_ms,
        )
        return self.get_speech_timestamps(wav, self.model, **detector_options)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
if __name__ == "__main__":
    # Demo: run Silero VAD on the example recording and print the result.
    import soundfile as sf
    # NOTE(review): verify that neverlib.vad.PreProcess really exports
    # HPFilter and volume_norm before relying on this demo.
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"

    # High-pass filter first, then normalise the volume.
    filtered = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(filtered)

    print(Silero_VAD_C().process(wav))
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WebRTC_VAD_C():
    """Frame-by-frame voice activity detection backed by ``webrtcvad``."""

    def __init__(self, sr=16000, window_len=10, mode=1):
        """
        :param sr: sample rate in Hz
        :param window_len: frame length in ms (the webrtcvad documentation
            lists 10, 20 and 30 ms as the supported frame durations)
        :param mode: value passed to ``Vad.set_mode`` (0~3)
        """
        import webrtcvad
        self.sr = sr
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(mode)  # 0~3
        self.window_len = int(window_len / 1000 * sr)  # frame length in samples

    def process(self, wav):
        """Return a per-sample speech mask for a 1-D float signal.

        The signal is cut into whole frames of ``window_len`` samples; a
        trailing partial frame is dropped, so the result can be shorter than
        the input.

        :param wav: one-dimensional float audio, nominally in [-1, 1]
        :return: int16 array holding 1 for samples in speech frames, else 0
        :raises AssertionError: if ``wav`` is not one-dimensional
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        # float -> int16. Clip first so samples outside [-1, 1] saturate
        # instead of overflowing the int16 range.
        int16_max = np.iinfo(np.int16).max
        wav_int16 = (np.clip(wav, -1.0, 1.0) * int16_max).astype(np.int16)

        # Keep only whole frames.
        usable = len(wav_int16) - len(wav_int16) % self.window_len
        wav_int16 = wav_int16[:usable]

        vad_array = np.zeros_like(wav_int16)
        for start in range(0, usable, self.window_len):
            stop = start + self.window_len
            vad_array[start:stop] = self.vad.is_speech(wav_int16[start:stop].tobytes(), self.sr)

        return vad_array
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == "__main__":
    # Demo: run WebRTC VAD on the example recording and plot the result.
    import soundfile as sf
    import matplotlib.pyplot as plt
    # NOTE(review): verify that neverlib.vad.PreProcess really exports
    # HPFilter and volume_norm before relying on this demo.
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"

    # High-pass filter first, then normalise the volume.
    filtered = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(filtered)

    vad_array = WebRTC_VAD_C().process(wav)

    # Figure 1: waveform with the speech mask drawn over it.
    plt.figure(figsize=(20, 5))
    for series in (wav, vad_array):
        plt.plot(series)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the signal (top) and of the mask (bottom).
    plt.figure(figsize=(20, 5))
    for row, series in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, row)
        plt.specgram(series, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WebRTC_VAD_C():
    """Frame-based voice activity detector built on ``webrtcvad``."""

    def __init__(self, sr=16000, window_len=10, mode=1):
        """
        :param sr: sample rate (Hz); handed to the detector with every frame
        :param window_len: frame length (ms)
        :param mode: detector mode passed to ``Vad.set_mode``, 0~3
        """
        # Local import: webrtcvad is only needed once an instance is created.
        import webrtcvad
        self.sr = sr
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(mode)  # 0~3
        # Frame length converted from milliseconds to a number of samples.
        self.window_len = int(window_len / 1000 * sr)

    def process(self, wav):
        """Run the detector frame by frame and return a per-sample speech mask.

        :param wav: 1-D float waveform, nominally within [-1, 1]; samples
            outside that range are clipped before the int16 conversion
        :return: int16 array with 1 for every sample of a frame flagged as
            speech and 0 elsewhere. Trailing samples that do not fill a whole
            frame are dropped, so the result can be shorter than ``wav``
        :raises ValueError: if the configured frame length is not positive
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        frame_len = self.window_len
        if frame_len <= 0:
            raise ValueError(f"window_len must be positive, got {frame_len}")

        # float -> int16 PCM. Clip first so that out-of-range samples saturate
        # instead of wrapping around during the integer cast.
        pcm = (np.clip(wav, -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16)
        # Keep whole frames only.
        pcm = pcm[:(len(pcm) // frame_len) * frame_len]

        vad_array = np.zeros_like(pcm)
        for start in range(0, len(pcm), frame_len):
            stop = start + frame_len
            # The boolean decision is broadcast over every sample of the frame.
            vad_array[start:stop] = self.vad.is_speech(pcm[start:stop].tobytes(), self.sr)

        return vad_array
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == "__main__":
    # Demo: run the detector on an example recording and plot the outcome.
    import soundfile as sf
    import matplotlib.pyplot as plt
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"

    # Pre-processing: high-pass filter first, then volume normalisation.
    wav = volume_norm(HPFilter(wav, sr=sr, order=6, cutoff=100))

    vad_array = WebRTC_VAD_C().process(wav)

    # Figure 1: waveform with the VAD mask drawn on top of it.
    plt.figure(figsize=(20, 5))
    for curve in (wav, vad_array):
        plt.plot(curve)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the waveform (top) and of the mask (bottom).
    plt.figure(figsize=(20, 5))
    for row, signal in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, row)
        plt.specgram(signal, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FunASR_VAD_C():
    """Voice activity detector backed by the FunASR ``fsmn-vad`` model."""

    def __init__(self, sr=16000):
        """
        :param sr: sample rate (Hz) used to turn segment times into sample indices
        """
        # Local import: funasr is only needed once an instance is created.
        from funasr import AutoModel

        self.sr = sr
        self.model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

    def process(self, wav):
        """Return a per-sample speech mask with the same shape and dtype as ``wav``.

        :param wav: 1-D waveform
        :return: array of zeros with ones written over every detected segment
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        results = self.model.generate(input=wav)
        mask = np.zeros_like(wav)

        # Each result carries a list of (begin, end) pairs under "value"; the
        # pairs are converted with sr / 1000, i.e. treated as milliseconds.
        segments = (pair for entry in results for pair in entry["value"])
        for start_ms, stop_ms in segments:
            first = int(start_ms * self.sr / 1000)
            last = int(stop_ms * self.sr / 1000)
            mask[first:last] = 1

        return mask
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
    # Demo: run the detector on an example recording and plot the outcome.
    import soundfile as sf
    import matplotlib.pyplot as plt
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"

    # Pre-processing: high-pass filter first, then volume normalisation.
    wav = volume_norm(HPFilter(wav, sr=sr, order=6, cutoff=100))

    vad_array = FunASR_VAD_C().process(wav)

    # Figure 1: waveform with the VAD mask drawn on top of it.
    plt.figure(figsize=(20, 5))
    for curve in (wav, vad_array):
        plt.plot(curve)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the waveform (top) and of the mask (bottom).
    plt.figure(figsize=(20, 5))
    for row, signal in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, row)
        plt.specgram(signal, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
# Author:凌逆战 | Never
|
|
3
|
+
# Date: 2024/9/19
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FunASR_VAD_C():
    """Voice activity detector backed by the FunASR ``fsmn-vad`` model."""

    def __init__(self, sr=16000):
        """
        :param sr: sample rate (Hz) used to turn segment times into sample indices
        """
        # Local import: funasr is only needed once an instance is created.
        from funasr import AutoModel

        self.sr = sr
        self.model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

    def process(self, wav):
        """Return a per-sample speech mask with the same shape and dtype as ``wav``.

        :param wav: 1-D waveform
        :return: array of zeros with ones written over every detected segment
        """
        assert wav.ndim == 1, f"wav shape为{wav.shape}, 期望1D"
        results = self.model.generate(input=wav)
        mask = np.zeros_like(wav)

        # Each result carries a list of (begin, end) pairs under "value"; the
        # pairs are converted with sr / 1000, i.e. treated as milliseconds.
        segments = (pair for entry in results for pair in entry["value"])
        for start_ms, stop_ms in segments:
            first = int(start_ms * self.sr / 1000)
            last = int(stop_ms * self.sr / 1000)
            mask[first:last] = 1

        return mask
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
    # Demo: run the detector on an example recording and plot the outcome.
    import soundfile as sf
    import matplotlib.pyplot as plt
    from neverlib.vad.PreProcess import HPFilter, volume_norm

    sr = 16000
    wav_path = "../../data/vad_example.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
    assert wav_sr == sr, f"音频采样率为{wav_sr}, 期望{sr}"

    # Pre-processing: high-pass filter first, then volume normalisation.
    wav = volume_norm(HPFilter(wav, sr=sr, order=6, cutoff=100))

    vad_array = FunASR_VAD_C().process(wav)

    # Figure 1: waveform with the VAD mask drawn on top of it.
    plt.figure(figsize=(20, 5))
    for curve in (wav, vad_array):
        plt.plot(curve)
    plt.grid()
    plt.show()

    # Figure 2: spectrogram of the waveform (top) and of the mask (bottom).
    plt.figure(figsize=(20, 5))
    for row, signal in enumerate((wav, vad_array), start=1):
        plt.subplot(2, 1, row)
        plt.specgram(signal, Fs=sr, scale_by_freq=True, sides='default', cmap="jet")
    plt.show()
|