neverlib 0.2.2-py3-none-any.whl → 0.2.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/__init__.py +2 -2
- neverlib/audio_aug/__init__.py +1 -1
- neverlib/audio_aug/audio_aug.py +4 -5
- neverlib/dataAnalyze/README.md +234 -0
- neverlib/dataAnalyze/__init__.py +87 -0
- neverlib/dataAnalyze/dataset_analyzer.py +590 -0
- neverlib/dataAnalyze/quality_metrics.py +364 -0
- neverlib/dataAnalyze/rms_distrubution.py +62 -0
- neverlib/dataAnalyze/spectral_analysis.py +218 -0
- neverlib/dataAnalyze/statistics.py +406 -0
- neverlib/dataAnalyze/temporal_features.py +126 -0
- neverlib/dataAnalyze/visualization.py +468 -0
- neverlib/filter/AudoEQ/README.md +165 -0
- neverlib/filter/AudoEQ/auto_eq_de.py +361 -0
- neverlib/filter/AudoEQ/auto_eq_ga_advanced.py +577 -0
- neverlib/filter/AudoEQ/auto_eq_ga_basic.py +380 -0
- neverlib/filter/AudoEQ/auto_eq_spectral_direct.py +75 -0
- neverlib/filter/README.md +101 -0
- neverlib/filter/__init__.py +7 -0
- neverlib/filter/biquad.py +45 -0
- neverlib/filter/common.py +5 -6
- neverlib/filter/core.py +339 -0
- neverlib/metrics/dnsmos.py +160 -0
- neverlib/metrics/snr.py +177 -0
- neverlib/metrics/spec.py +45 -0
- neverlib/metrics/test_pesq.py +35 -0
- neverlib/metrics/time.py +68 -0
- neverlib/tests/test_vad.py +21 -0
- neverlib/utils/audio_split.py +2 -1
- neverlib/utils/message.py +4 -4
- neverlib/utils/utils.py +32 -15
- neverlib/vad/PreProcess.py +1 -1
- neverlib/vad/README.md +10 -10
- neverlib/vad/VAD_Energy.py +1 -1
- neverlib/vad/VAD_Silero.py +1 -1
- neverlib/vad/VAD_WebRTC.py +1 -1
- neverlib/vad/VAD_funasr.py +1 -1
- neverlib/vad/VAD_statistics.py +3 -3
- neverlib/vad/VAD_vadlib.py +2 -2
- neverlib/vad/VAD_whisper.py +1 -1
- neverlib/vad/__init__.py +1 -1
- neverlib/vad/class_get_speech.py +4 -4
- neverlib/vad/class_vad.py +1 -1
- neverlib/vad/utils.py +47 -5
- {neverlib-0.2.2.dist-info → neverlib-0.2.3.dist-info}/METADATA +120 -120
- neverlib-0.2.3.dist-info/RECORD +53 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.3.dist-info}/WHEEL +1 -1
- neverlib/Documents/vad/VAD_Energy.ipynb +0 -159
- neverlib/Documents/vad/VAD_Silero.ipynb +0 -305
- neverlib/Documents/vad/VAD_WebRTC.ipynb +0 -183
- neverlib/Documents/vad/VAD_funasr.ipynb +0 -179
- neverlib/Documents/vad/VAD_ppasr.ipynb +0 -175
- neverlib/Documents/vad/VAD_statistics.ipynb +0 -522
- neverlib/Documents/vad/VAD_vadlib.ipynb +0 -184
- neverlib/Documents/vad/VAD_whisper.ipynb +0 -430
- neverlib/utils/waveform_analyzer.py +0 -51
- neverlib/wav_data/000_short.wav +0 -0
- neverlib-0.2.2.dist-info/RECORD +0 -40
- {neverlib-0.2.2.dist-info → neverlib-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.3.dist-info}/top_level.txt +0 -0
neverlib/metrics/spec.py
ADDED
@@ -0,0 +1,45 @@
+"""
+Objective evaluation metrics in the frequency domain
+LSD: log-spectral distance
+"""
+import sys
+sys.path.append("..")
+import numpy as np
+import librosa
+from utils import EPS
+
+
+def lsd(reference, estimate, n_fft=2048, hop_length=512, win_length=None):
+    """
+    Compute the log-spectral distance (LSD) between two 1-D audio signals.
+    This implementation follows the standard LSD definition: an overall root-mean-square error.
+
+    Args:
+        reference (np.ndarray): the original, clean reference signal (1-D array).
+        estimate (np.ndarray): the model's estimated or processed signal (1-D array).
+        n_fft (int): number of FFT points; determines the frequency resolution.
+        hop_length (int): hop size; determines the time resolution.
+        win_length (int, optional): window length; defaults to n_fft when None.
+        Note: EPS (a tiny constant) is added before taking logs, to avoid log(0) and keep the computation numerically stable.
+
+    Returns:
+        float: the log-spectral distance, in decibels (dB).
+    """
+    assert reference.ndim == 1 and estimate.ndim == 1, "Input signals must be 1-D arrays."
+
+    if win_length is None:
+        win_length = n_fft
+
+    reference_stft = librosa.stft(reference, n_fft=n_fft, hop_length=hop_length, win_length=win_length)  # (F,T)
+    estimate_stft = librosa.stft(estimate, n_fft=n_fft, hop_length=hop_length, win_length=win_length)  # (F,T)
+
+    reference_power_spec = np.abs(reference_stft) ** 2  # (F,T)
+    estimate_power_spec = np.abs(estimate_stft) ** 2  # (F,T)
+
+    reference_log_power_spec = 10 * np.log10(reference_power_spec + EPS)
+    estimate_log_power_spec = 10 * np.log10(estimate_power_spec + EPS)
+
+    squared_error = (reference_log_power_spec - estimate_log_power_spec) ** 2
+    lsd_val = np.sqrt(np.mean(squared_error))
+
+    return lsd_val
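A minimal usage sketch for the new lsd metric, assuming the module is importable as below and two mono WAV files of the same sample rate exist (the file names here are hypothetical):

import librosa
from neverlib.metrics.spec import lsd  # import path assumed; the module itself uses a sys.path hack

reference, _ = librosa.load("clean.wav", sr=16000)    # hypothetical reference file
estimate, _ = librosa.load("enhanced.wav", sr=16000)  # hypothetical processed file
n = min(len(reference), len(estimate))                # align lengths before comparing
print(f"LSD = {lsd(reference[:n], estimate[:n]):.3f} dB")  # identical signals give 0 dB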
neverlib/metrics/test_pesq.py
ADDED
@@ -0,0 +1,35 @@
+'''
+Author: 凌逆战 | Never
+Date: 2025-08-05 23:37:31
+Description:
+
+PESQ comes in 3 kinds of values: NB PESQ MOS, NB MOS LQO, and WB MOS LQO. This package only returns NB PESQ MOS, the raw MOS score for narrowband handset listening.
+'''
+import pesq
+import pypesq
+import librosa
+import os
+import numpy as np
+
+fs = 16000
+clean = librosa.load("../data/000_short.wav", sr=fs)[0]
+enhance = librosa.load("../data/000_short_enhance.wav", sr=fs)[0]
+
+print(pesq.pesq(fs, clean, enhance, 'wb'))  # 3.5920536518096924
+print(pypesq.pesq(clean, enhance, fs=fs))  # 3.817176103591919
+# os.system("./pesq_c/PESQ +16000 ../data/000_short.wav ../data/000_short_enhance.wav")  # WB PESQ_MOS = 3.518
+# os.system("./pesq_c/PESQ +8000 ../data/000_short.wav ../data/000_short_enhance.wav")  # NB PESQ_MOS = 3.477
+
+
+def pesq2mos(pesq):
+    """Map a PESQ value in [-0.5, 4.5] to a MOS-LQO score in [1, 4.5]; the mapping function comes from ITU-T P.862.1"""
+    return 0.999 + (4.999 - 0.999) / (1 + np.exp(-1.4945 * pesq + 4.6607))
+
+
+def mos2pesq(mos):
+    """Map a MOS-LQO score in [1, 4.5] to a PESQ value in [-0.5, 4.5]; the mapping function comes from ITU-T P.862.1"""
+    inlog = (4.999 - mos) / (mos - 0.999)
+    return (4.6607 - np.log(inlog)) / 1.4945
+
+
+# print(mos2pesq(3.518))
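The P.862.1 mapping is a logistic curve, so it is easy to sanity-check by hand; the two functions are restated here for a self-contained sketch (numeric outputs are approximate):

import numpy as np

def pesq2mos(pesq):
    return 0.999 + (4.999 - 0.999) / (1 + np.exp(-1.4945 * pesq + 4.6607))

def mos2pesq(mos):
    inlog = (4.999 - mos) / (mos - 0.999)
    return (4.6607 - np.log(inlog)) / 1.4945

print(round(pesq2mos(3.592), 2))            # ~3.68: the raw WB score above, mapped to MOS-LQO
print(round(mos2pesq(pesq2mos(3.592)), 2))  # ~3.59: the two mappings invert each other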
neverlib/metrics/time.py
ADDED
@@ -0,0 +1,68 @@
+'''
+Author: 凌逆战 | Never
+Date: 2025-08-05 16:44:41
+Description:
+'''
+"""
+Basic tools module for audio data analysis
+Basic Utilities for Audio Data Analysis
+
+Provides basic utility functions for audio analysis
+"""
+
+import numpy as np
+import librosa
+
+
+def peak_amplitude(wav):
+    """Compute the peak amplitude
+
+    Args:
+        wav: audio signal (*, ch)
+
+    Returns:
+        Peak amplitude (linear, not dB)
+    """
+    peak_amp = np.max(np.abs(wav))
+    return peak_amp
+
+
+def rms_amplitude(wav):
+    """Compute the RMS amplitude
+
+    Args:
+        wav: audio signal (*, ch)
+
+    Returns:
+        RMS amplitude
+    """
+    return np.sqrt(np.mean(np.square(wav)))
+
+
+def mean_rms_amplitude(wav, frame_length=512, hop_length=256):
+    """Compute the frame-averaged RMS amplitude
+
+    Args:
+        wav: audio signal (*, ch)
+        frame_length: frame length
+        hop_length: hop length
+
+    Returns:
+        Mean RMS amplitude
+    """
+    # Split into frames
+    frame = librosa.util.frame(wav.flatten(), frame_length=frame_length, hop_length=hop_length)
+    rms_amp = np.sqrt(np.mean(np.square(frame), axis=0))
+    return np.mean(rms_amp)
+
+
+def dc_offset(wav):
+    """Compute the DC offset
+
+    Args:
+        wav: audio signal (*, ch)
+
+    Returns:
+        DC offset
+    """
+    return np.mean(wav)
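Both amplitude helpers return linear values; converting to dBFS is a one-liner on top, as in this sketch (values approximate):

import numpy as np

wav = np.array([0.0, 0.5, -0.25, 0.1], dtype=np.float32)  # toy signal
peak = np.max(np.abs(wav))              # 0.5 (linear), i.e. peak_amplitude(wav)
rms = np.sqrt(np.mean(np.square(wav)))  # ~0.284 (linear), i.e. rms_amplitude(wav)
print(20 * np.log10(peak))              # ~ -6.0 dBFS
print(20 * np.log10(rms))               # ~ -10.9 dBFS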
neverlib/tests/test_vad.py
ADDED
@@ -0,0 +1,21 @@
+'''
+Author: 凌逆战 | Never
+Date: 2025-08-05 17:29:43
+Description:
+'''
+import os
+import sys
+sys.path.append("../")
+from vad.utils import vad2nad
+
+
+def test_vad2nad():
+    """Test the vad2nad function"""
+    vad = [{'start': 100, 'end': 1000}, {'start': 2000, 'end': 3000}]
+    total_length = 4000
+    nad = vad2nad(vad, total_length)
+    print(nad)
+
+
+if __name__ == "__main__":
+    test_vad2nad()
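Given the vad2nad implementation added to neverlib/vad/utils.py later in this diff, the expected output of this test is deterministic; a hedged check:

expected = [{'start': 0, 'end': 100}, {'start': 1000, 'end': 2000}, {'start': 3000, 'end': 4000}]
assert vad2nad(vad, total_length) == expected  # the gaps before, between, and after the speech segments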
neverlib/utils/audio_split.py
CHANGED
@@ -14,7 +14,7 @@ from pydub import AudioSegment
 
 
 def audio_split_ffmpeg(source_path, target_path, sr, channel_num, duration, endwith="*.pcm"):
-    """
+    """ The cut is not exact; it leaves a ~0.016 s tail
    Split audio with ffmpeg into short clips (unit: seconds); it seems unable to cut precisely to the requested length
    :param source_path: source audio path
    :param target_path: target audio path
@@ -94,6 +94,7 @@ def audio_split_np(source_path, target_path, sr, channel_num, duration, endwith=
 
    # Note: read with the correct dtype (e.g. int16 for 16-bit PCM)
    pcm_data = np.fromfile(wav_path, dtype=np.int16)
+   pcm_data = pcm_data[:(len(pcm_data) // channel_num) * channel_num]
    pcm_data = pcm_data.reshape(-1, channel_num)
 
    # Compute the number of splits
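The new line guards against PCM files whose sample count is not a multiple of the channel count, which would otherwise make the reshape fail; a quick sketch of the failure mode:

import numpy as np

channel_num = 2
pcm_data = np.arange(7, dtype=np.int16)  # odd sample count, e.g. a truncated stereo capture
# pcm_data.reshape(-1, channel_num) would raise ValueError here
pcm_data = pcm_data[:(len(pcm_data) // channel_num) * channel_num]  # drop the trailing partial frame
print(pcm_data.reshape(-1, channel_num).shape)  # (3, 2)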
neverlib/utils/message.py
CHANGED
@@ -57,7 +57,7 @@ def send_QQEmail_with_images(title, content, from_name, from_email, from_passwor
    :param from_email: sender email address
    :param from_password: sender email SMTP authorization code
    :param to_email: recipient email address
-   :param image_paths:
+   :param image_paths: list of image file paths; should be PNG format
    """
    # Set the mail server domain
    HOST = "smtp.qq.com"
@@ -87,10 +87,10 @@ def send_QQEmail_with_images(title, content, from_name, from_email, from_passwor
        # Attach the image with MIMEImage
        image_part = MIMEImage(img_data)
 
-       # Set the Content-ID
+       # Set the Content-ID, so the image can be referenced in the body
        image_part.add_header('Content-ID', cid)
 
-       # Set to inline
+       # Set to inline display, so it is not treated as an attachment
        image_part.add_header('Content-Disposition', 'inline', filename=os.path.basename(image_path))
 
        # Add the image to the email
@@ -115,7 +115,7 @@ def send_QQEmail_with_images(title, content, from_name, from_email, from_passwor
 
 
 if __name__ == "__main__":
-    send_QQEmail("Experiment finished", "
+    send_QQEmail("Experiment finished", "The experiment is done, go take a look!",
                  from_email="1786088386@qq.com", from_password="xxxx",
                  to_email="1786088386@qq.com")
    pass
neverlib/utils/utils.py
CHANGED
@@ -12,6 +12,7 @@ from tqdm import tqdm
 from datetime import datetime
 import soundfile as sf
 import numpy as np
+EPS = np.finfo(float).eps
 
 
 def get_path_list(source_path, end="*.wav", shuffle=False):
@@ -49,10 +50,10 @@ def rename_files_and_folders(directory, replace='_-', replacement='_'):
 def get_file_time(file_path):
    # Get the last modification time
    mod_time = os.path.getmtime(file_path)
-   # Convert to datetime
+   # Convert to datetime format: year-month-day-hour-minute-second
    datetime_dt = datetime.fromtimestamp(mod_time)
 
-   # If the time is earlier than 2024-09-04 02:00:00
+   # If the time is earlier than 2024-09-04 02:00:00, delete it
    # if datetime_dt < datetime(2024, 9, 4, 2, 0, 0):
    #     print(file_path)
    return datetime_dt
@@ -97,38 +98,38 @@ def TrainValTestSplit(dataset_dir, train_dir, val_dir, test_dir, percentage=[0.8
    :param percentage: split percentages
    """
    assert sum(percentage) == 1.0, "The percentages must sum to 1.0"
-
+
    wav_path_list = sorted(get_path_list(dataset_dir, end="*.wav"))
    random.seed(10086)
    random.shuffle(wav_path_list)  # Shuffle the order of the list
    total_wav_num = len(wav_path_list)
-
+
    # Compute the split points for the train, validation, and test sets
    train_split_idx = int(total_wav_num * percentage[0])
    val_split_idx = train_split_idx + int(total_wav_num * percentage[1])
-
+
    train_path_list = wav_path_list[:train_split_idx]
    val_path_list = wav_path_list[train_split_idx:val_split_idx]
    test_path_list = wav_path_list[val_split_idx:]
-
+
    for train_wavpath in tqdm(train_path_list, desc="Copying training-set audio"):
        target_path = train_wavpath.replace(dataset_dir, train_dir)
        if not os.path.exists(os.path.split(target_path)[0]):
            os.makedirs(os.path.split(target_path)[0])
        shutil.copy(train_wavpath, target_path)
-
+
    for val_wavpath in tqdm(val_path_list, desc="Copying validation-set audio"):
        target_path = val_wavpath.replace(dataset_dir, val_dir)
        if not os.path.exists(os.path.split(target_path)[0]):
            os.makedirs(os.path.split(target_path)[0])
        shutil.copy(val_wavpath, target_path)
-
+
    for test_wavpath in tqdm(test_path_list, desc="Copying test-set audio"):
        target_path = test_wavpath.replace(dataset_dir, test_dir)
        if not os.path.exists(os.path.split(target_path)[0]):
            os.makedirs(os.path.split(target_path)[0])
        shutil.copy(test_wavpath, target_path)
-
+
    print(f"Done! Train: {len(train_path_list)} files, validation: {len(val_path_list)} files, test: {len(test_path_list)} files")
 
 
@@ -141,15 +142,31 @@ def get_leaf_folders(directory):
    return leaf_folders
 
 
+def del_empty_folders(path):
+   """Recursively delete empty folders"""
+   assert os.path.isdir(path), f"{path} is not a valid directory"
+
+   # os.walk(path, topdown=False) traverses from the deepest subdirectory upward
+   for dirpath, dirnames, filenames in os.walk(path, topdown=False):
+       # A directory with neither subdirectories nor files is empty
+       if not dirnames and not filenames:
+           try:
+               os.rmdir(dirpath)
+               print(f"Deleted empty folder: {dirpath}")
+           except OSError as e:
+               # Catch possible errors, e.g. insufficient permissions or a directory that is not truly "empty" (hidden files)
+               print(f"Failed to delete: {dirpath} - {e}")
+
+
 def DatasetSubfloderSplit(source_dir, split_dirs, percentage=None):
    """
    Split one dataset into train/val/test sets by the number of subfolders
    Args:
        source_dir (str): source dataset directory
-       split_dirs (list):
-       percentage (list, optional):
-           -
-           -
+       split_dirs (list): list of target directories, e.g. [train_dir, val_dir] or [train_dir, val_dir, test_dir]
+       percentage (list, optional): split ratios, e.g. [0.9, 0.1] or [0.8, 0.1, 0.1]. Defaults to None, in which case:
+           - a two-way split defaults to [0.9, 0.1]
+           - a three-way split defaults to [0.8, 0.1, 0.1]
    Example:
        # Two-way split example
        DatasetSplit(
@@ -247,11 +264,11 @@ def save_weight_histogram(model, save_dir, mode=["params", "buffers"], ignore_na
    Args:
        model: PyTorch model
        save_dir: save path
-       mode:
+       mode: save mode; valid values are ["params", "buffers"]
        bins: number of histogram bins
    """
    import matplotlib.pyplot as plt
-   #
+   # If the save path already exists, delete it
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
 
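One behavior of the new del_empty_folders worth noting: with os.walk(topdown=False), a parent's dirnames is listed before its children are removed, so a single pass deletes only the deepest empty level. A minimal sketch, assuming the import path below:

import os
import tempfile
from neverlib.utils.utils import del_empty_folders  # import path assumed

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "a", "b"))  # two nested empty folders
del_empty_folders(root)  # removes only ".../a/b"; "a" still had "b" in its dirnames listing
del_empty_folders(root)  # a second pass removes ".../a"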
neverlib/vad/PreProcess.py
CHANGED
neverlib/vad/README.md
CHANGED
@@ -1,7 +1,7 @@
 ## energy-vad
 
 https://pypi.org/project/energy-vad/
-
+The error is fairly large; moreover, it splits continuous speech that has no need to be split
 
 ## Funasr
 
@@ -23,12 +23,12 @@ API docs: https://github.com/snakers4/silero-vad/blob/master/utils_vad.py
 
 https://github.com/eesungkim/Voice_Activity_Detector
 A statistics-based VAD; works reasonably well
-
+Captures inter-word detail very well, but sometimes swallows a word
 
 ## vad
 
 https://pypi.org/project/vad/
-
+Also acceptable, though it trims a little too aggressively
 
 ## webrtcvad
 
@@ -38,18 +38,18 @@ https://pypi.org/project/vad/
 
 mode 0~3
 
-0:
+0: the lowest speech-detection sensitivity,
 
--
--
+- treats background noise as non-speech
+- suited to quiet environments with little background noise
 
-3:
+3: the highest speech-detection sensitivity,
 
-- VAD
--
+- the VAD aggressively tries to filter out any noise; only clear speech is classified as speech
+- suited to noisy environments with heavy background noise
 
 ## whisper
 
 The word-to-word VAD segments that whisper detects are all contiguous, but the audio actually is not
 
-Moreover, Whisper's VAD
+Moreover, Whisper's VAD exposes no tuning interface, so its VAD parameters cannot be adjusted
neverlib/vad/VAD_Energy.py
CHANGED
@@ -47,7 +47,7 @@ if __name__ == "__main__":
    sr = 16000
    wav_path = "../wav_data/000_short.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
-   assert wav_sr == sr, f"Sample rate is {wav_sr}
+   assert wav_sr == sr, f"Sample rate is {wav_sr}, expected {sr}"
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)
 
neverlib/vad/VAD_Silero.py
CHANGED
@@ -41,7 +41,7 @@ if __name__ == "__main__":
    sr = 16000
    wav_path = "../../wav_data/000_short.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
-   assert wav_sr == sr, f"Sample rate is {wav_sr}
+   assert wav_sr == sr, f"Sample rate is {wav_sr}, expected {sr}"
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)
 
neverlib/vad/VAD_WebRTC.py
CHANGED
@@ -40,7 +40,7 @@ if __name__ == "__main__":
    sr = 16000
    wav_path = "../../wav_data/000_short.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
-   assert wav_sr == sr, f"Sample rate is {wav_sr}
+   assert wav_sr == sr, f"Sample rate is {wav_sr}, expected {sr}"
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)
 
neverlib/vad/VAD_funasr.py
CHANGED
@@ -33,7 +33,7 @@ if __name__ == "__main__":
    sr = 16000
    wav_path = "../../wav_data/000_short.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
-   assert wav_sr == sr, f"Sample rate is {wav_sr}
+   assert wav_sr == sr, f"Sample rate is {wav_sr}, expected {sr}"
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)
 
neverlib/vad/VAD_statistics.py
CHANGED
@@ -4,7 +4,7 @@
 """
 https://github.com/eesungkim/Voice_Activity_Detector
 A statistics-based VAD; works reasonably well
-
+Captures inter-word detail very well, but sometimes swallows a word
 This file is meant to be used as a library
 """
 import os
@@ -256,11 +256,11 @@ class Statistics_VAD():
        starts = np.where((vad_array[:-1] == 0) & (vad_array[1:] == 1))[0] + 1
        ends = np.where((vad_array[:-1] == 1) & (vad_array[1:] == 0))[0]
 
-       # If the array ends while still active (last value is 1)
+       # If the array ends while still active (last value is 1), append the end point manually
        if vad_array[-1] == 1:
            ends = np.append(ends, len(vad_array) - 1)
 
-       # If the array starts already active (first value is 1)
+       # If the array starts already active (first value is 1), insert the start point manually
        if vad_array[0] == 1:
            starts = np.insert(starts, 0, 0)
 
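The transition detection in the hunk above pairs 0→1 edges (shifted by one) with 1→0 edges, then patches the boundaries; a toy run of the same logic:

import numpy as np

vad_array = np.array([1, 1, 0, 0, 1, 1, 1, 0])
starts = np.where((vad_array[:-1] == 0) & (vad_array[1:] == 1))[0] + 1
ends = np.where((vad_array[:-1] == 1) & (vad_array[1:] == 0))[0]
if vad_array[-1] == 1:
    ends = np.append(ends, len(vad_array) - 1)
if vad_array[0] == 1:
    starts = np.insert(starts, 0, 0)
print(starts, ends)  # [0 4] [1 6]: active segments are frames 0-1 and 4-6 (inclusive)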
neverlib/vad/VAD_vadlib.py
CHANGED
@@ -30,7 +30,7 @@ class Vadlib_C():
 
    def process(self, wav):
        assert wav.ndim == 1, f"wav shape is {wav.shape}, expected 1D"
-       #
+       # Returns a boolean array indicating whether each frame is speech
        voice_activity = self.vad(wav)  # (115,) [1,1,0,0,1,1,0,0,...]
 
        window_len = int(self.frame_length / 1000 * self.sr)
@@ -49,7 +49,7 @@ if __name__ == "__main__":
    sr = 16000
    wav_path = "../../wav_data/000_short.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
-   assert wav_sr == sr, f"Sample rate is {wav_sr}
+   assert wav_sr == sr, f"Sample rate is {wav_sr}, expected {sr}"
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)
 
neverlib/vad/VAD_whisper.py
CHANGED
@@ -45,7 +45,7 @@ if __name__ == "__main__":
    sr = 16000
    wav_path = "../../wav_data/000_short.wav"
    wav, wav_sr = sf.read(wav_path, always_2d=False, dtype="float32")
-   assert wav_sr == sr, f"Sample rate is {wav_sr}
+   assert wav_sr == sr, f"Sample rate is {wav_sr}, expected {sr}"
    wav = HPFilter(wav, sr=sr, order=6, cutoff=100)
    wav = volume_norm(wav)
 
neverlib/vad/__init__.py
CHANGED
@@ -15,4 +15,4 @@ from .VAD_statistics import Statistics_VAD
 from .VAD_vadlib import Vadlib_C
 from .VAD_WebRTC import WebRTC_VAD_C
 from .VAD_whisper import Whisper_VAD_C
-from .utils import from_vadArray_to_vadEndpoint
+from .utils import from_vadArray_to_vadEndpoint, vad2nad
neverlib/vad/class_get_speech.py
CHANGED
@@ -2,7 +2,7 @@
 # Author: 凌逆战 | Never
 # Date: 2024/9/3
 """
-
+Extract the clean speech, deleting the silence
 """
 import os
 import numpy as np
@@ -28,9 +28,9 @@ class getSpeech():
        wav, wav_sr = sf.read(wav_path, always_2d=True, dtype='float32')
        assert wav_sr == self.sr, f"The sample rate should be {self.sr}"
        # wav = wav / np.abs(wav).max()  # normalize
-       # voice_activity = self.vad(wav)  #
+       # voice_activity = self.vad(wav)  # returns a boolean array indicating whether each frame is speech
 
-       #
+       # Extract the speech, delete the silence
        speech_signal = self.vad.apply_vad(wav.T).T
        return speech_signal
 
@@ -43,7 +43,7 @@ class getSpeech():
            keep_silence=False,  # keep silence at the beginning and end
        )
 
-       output_audio = AudioSegment.empty()  #
+       output_audio = AudioSegment.empty()  # create an empty audio segment to hold the non-silent parts
 
        # Append the non-silent segments to the output audio
        for segment in segments:
neverlib/vad/class_vad.py
CHANGED
@@ -60,7 +60,7 @@ class VADClass():
        assert wav.ndim == 1, "wav must be 1D"
        res_list = self.model.generate(input=wav)
        # Note: the VAD model's output format is [[beg1, end1], [beg2, end2], ..., [begN, endN]]
-       # where begN/endN is the start/end of the N-th valid audio segment
+       # where begN/endN is the start/end of the N-th valid audio segment, in milliseconds
        # print(res_list)  # [{'key': 'rand_key_2yW4Acq9GFz6Y', 'value': [[0, 2140(ms)]]}]
        endpint = []
        for res in res_list:
neverlib/vad/utils.py
CHANGED
@@ -8,18 +8,60 @@ import numpy as np
 
 
 def from_vadArray_to_vadEndpoint(vad_array):
+   """
+   Convert a VAD array into a list of VAD timestamps
+   Args:
+       vad_array: 1-D VAD array
+       # vad_array = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
+
+   Returns:
+       Timestamps: [{start:xxx, end:xxx}, ...]
+   """
    # Compute the start and end points of the active segments
-   # vad_array = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
    # Returns [(2320, 8079), (8400, 8719), (8880, 10959), (11600, 25039), (25840, 27439), (29040, 29359), (29520, 31759), (32240, 32399)]
    starts = np.where((vad_array[:-1] == 0) & (vad_array[1:] == 1))[0] + 1  # +1 because the edge is detected one point early
    ends = np.where((vad_array[:-1] == 1) & (vad_array[1:] == 0))[0] + 1  # +1 because the end point is exclusive
 
-   # If the last point is still 1
-   if vad_array[-1] == 1:
+   # If the last point is still 1, append the end point manually
+   if vad_array[-1] == 1:
+       ends = np.append(ends, len(vad_array))
    # If the very first point is 1, insert the start point manually
-   if vad_array[0] == 1:
+   if vad_array[0] == 1:
+       starts = np.insert(starts, 0, 0)
    assert len(starts) == len(ends), "starts and ends must have the same length"
 
    Timestamps = [{"start": int(start), "end": int(end)} for start, end in zip(starts, ends)]
 
-   return Timestamps
+   return Timestamps
+
+
+def vad2nad(vad, total_length):
+   """Extract noise timestamps from the speech timestamps (optimized version)
+   Args:
+       vad: [{start:xxx, end:xxx}, ...]
+       total_length: total audio length (in samples)
+   Returns:
+       nad: [{start:xxx, end:xxx}, ...] list of noise timestamps
+   """
+   assert total_length > 0, "The total audio length must be greater than 0"
+   assert isinstance(vad, list), "vad must be a list"
+
+   # Sort by start time to make sure the VAD segments are ordered
+   vad_sorted = sorted(vad, key=lambda x: x['start'])
+
+   nad = []
+   last_end = 0
+   for segment in vad_sorted:
+       start = segment['start']
+       # Check the gap between this speech segment and the previous one / the start of the audio
+       if start > last_end:
+           nad.append({'start': last_end, 'end': start})
+
+       # Use max to handle possibly overlapping VAD segments
+       last_end = max(last_end, segment['end'])
+
+   # Check the gap between the last speech segment and the end of the audio
+   if last_end < total_length:
+       nad.append({'start': last_end, 'end': total_length})
+
+   return nad