neverlib 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.history/Docs/audio_aug/test_snr_20250806011311.py +0 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011331.py +75 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011342.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011352.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011403.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011413.py +57 -0
- neverlib/.history/Docs/audio_aug/test_snr_20250806011435.py +55 -0
- neverlib/.history/Docs/vad/1_20250810032405.py +0 -0
- neverlib/.history/Docs/vad/1_20250810032417.py +39 -0
- neverlib/.history/audio_aug/audio_aug_20250806010451.py +125 -0
- neverlib/.history/audio_aug/audio_aug_20250806010750.py +138 -0
- neverlib/.history/audio_aug/audio_aug_20250806010759.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010803.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806010809.py +140 -0
- neverlib/.history/audio_aug/audio_aug_20250806011108.py +140 -0
- neverlib/.history/dataAnalyze/__init___20250805234204.py +87 -0
- neverlib/.history/dataAnalyze/__init___20250806204125.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204139.py +14 -0
- neverlib/.history/dataAnalyze/__init___20250806204159.py +14 -0
- neverlib/.history/filter/__init___20250820103351.py +70 -0
- neverlib/.history/filter/__init___20250821102348.py +70 -0
- neverlib/.history/filter/__init___20250821102405.py +14 -0
- neverlib/.history/filter/auto_eq/__init___20250819213121.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102241.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102259.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102307.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102310.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102318.py +36 -0
- neverlib/.history/filter/auto_eq/__init___20250821102507.py +36 -0
- neverlib/.history/filter/auto_eq/de_eq_20250820103848.py +361 -0
- neverlib/.history/filter/auto_eq/de_eq_20250821102422.py +360 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250805234206.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140732.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140745.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140816.py +75 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820140938.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141003.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141006.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141019.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141049.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141211.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141227.py +77 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141311.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141340.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141712.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141733.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250820141755.py +78 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102434.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102500.py +76 -0
- neverlib/.history/filter/auto_eq/freq_eq_20250821102502.py +76 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820102957.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113054.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113150.py +380 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113520.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250820113525.py +385 -0
- neverlib/.history/filter/auto_eq/ga_eq_basic_20250821102212.py +385 -0
- neverlib/.history/metrics/dnsmos_20250806001612.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180659.py +160 -0
- neverlib/.history/metrics/dnsmos_20250815180701.py +158 -0
- neverlib/.history/metrics/dnsmos_20250815181321.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181327.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181331.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181620.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181631.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181742.py +154 -0
- neverlib/.history/metrics/dnsmos_20250815181824.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181834.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815181922.py +153 -0
- neverlib/.history/metrics/dnsmos_20250815182011.py +147 -0
- neverlib/.history/metrics/dnsmos_20250815182036.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815182936.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815182942.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183032.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183101.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183121.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183123.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183214.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183240.py +143 -0
- neverlib/.history/metrics/dnsmos_20250815183248.py +144 -0
- neverlib/.history/metrics/dnsmos_20250815183407.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183409.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183431.py +142 -0
- neverlib/.history/metrics/dnsmos_20250815183507.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183513.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183618.py +139 -0
- neverlib/.history/metrics/dnsmos_20250815183709.py +140 -0
- neverlib/.history/metrics/dnsmos_20250815183756.py +137 -0
- neverlib/.history/metrics/dnsmos_20250815183815.py +128 -0
- neverlib/.history/metrics/dnsmos_20250815183827.py +129 -0
- neverlib/.history/metrics/dnsmos_20250815183913.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815183914.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184003.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184040.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184049.py +118 -0
- neverlib/.history/metrics/dnsmos_20250815184104.py +117 -0
- neverlib/.history/metrics/dnsmos_20250815184200.py +117 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816015944.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020142.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020156.py +128 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020554.py +130 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020600.py +125 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020631.py +120 -0
- neverlib/.history/metrics/lpc_lsp_metric_20250816020746.py +118 -0
- neverlib/.history/metrics/lpc_me_20250816013111.py +0 -0
- neverlib/.history/metrics/lpc_me_20250816013129.py +121 -0
- neverlib/.history/metrics/lpc_me_20250816015430.py +103 -0
- neverlib/.history/metrics/lpc_me_20250816015535.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015542.py +96 -0
- neverlib/.history/metrics/lpc_me_20250816015636.py +97 -0
- neverlib/.history/metrics/lpc_me_20250816015658.py +104 -0
- neverlib/.history/metrics/lpc_me_20250816015703.py +100 -0
- neverlib/.history/metrics/lpc_me_20250816015945.py +128 -0
- neverlib/.history/metrics/snr_20250806010538.py +177 -0
- neverlib/.history/metrics/snr_20250806211634.py +184 -0
- neverlib/.history/metrics/spec_20250805234209.py +45 -0
- neverlib/.history/metrics/spec_20250816135530.py +11 -0
- neverlib/.history/metrics/spec_20250816135654.py +16 -0
- neverlib/.history/metrics/spec_20250816135736.py +68 -0
- neverlib/.history/metrics/spec_20250816135904.py +75 -0
- neverlib/.history/metrics/spec_20250816135921.py +82 -0
- neverlib/.history/metrics/spec_20250816140111.py +82 -0
- neverlib/.history/metrics/spec_20250816140543.py +136 -0
- neverlib/.history/metrics/spec_20250816140559.py +172 -0
- neverlib/.history/metrics/spec_20250816140602.py +172 -0
- neverlib/.history/metrics/spec_20250816140608.py +172 -0
- neverlib/.history/metrics/spec_20250816140654.py +148 -0
- neverlib/.history/metrics/spec_20250816140705.py +144 -0
- neverlib/.history/metrics/spec_20250816140755.py +138 -0
- neverlib/.history/metrics/spec_20250816140823.py +170 -0
- neverlib/.history/metrics/spec_20250816140832.py +170 -0
- neverlib/.history/metrics/spec_20250816140833.py +170 -0
- neverlib/.history/metrics/spec_20250816140922.py +147 -0
- neverlib/.history/metrics/spec_20250816141148.py +107 -0
- neverlib/.history/metrics/spec_20250816141219.py +123 -0
- neverlib/.history/metrics/spec_20250816141732.py +178 -0
- neverlib/.history/metrics/spec_20250816141740.py +178 -0
- neverlib/.history/metrics/spec_20250816142030.py +178 -0
- neverlib/.history/metrics/spec_20250816142107.py +135 -0
- neverlib/.history/metrics/spec_20250816142126.py +135 -0
- neverlib/.history/metrics/spec_20250816142410.py +135 -0
- neverlib/.history/metrics/spec_20250816142415.py +136 -0
- neverlib/.history/metrics/spec_metric_20250816135156.py +0 -0
- neverlib/.history/metrics/spec_metric_20250816135226.py +5 -0
- neverlib/.history/metrics/spec_metric_20250816135227.py +10 -0
- neverlib/.history/metrics/spec_metric_20250816135306.py +15 -0
- neverlib/.history/metrics/spec_metric_20250816135442.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135448.py +31 -0
- neverlib/.history/metrics/spec_metric_20250816135520.py +29 -0
- neverlib/.history/metrics/spec_metric_20250816135537.py +63 -0
- neverlib/.history/metrics/spec_metric_20250816135653.py +65 -0
- neverlib/.history/vad/PreProcess_20250805234211.py +63 -0
- neverlib/.history/vad/PreProcess_20250809232455.py +63 -0
- neverlib/.history/vad/PreProcess_20250816020725.py +66 -0
- neverlib/.history/vad/VAD_Silero_20250805234211.py +50 -0
- neverlib/.history/vad/VAD_Silero_20250809232456.py +50 -0
- neverlib/.history/vad/VAD_WebRTC_20250805234211.py +61 -0
- neverlib/.history/vad/VAD_WebRTC_20250809232456.py +61 -0
- neverlib/.history/vad/VAD_funasr_20250805234211.py +54 -0
- neverlib/.history/vad/VAD_funasr_20250809232456.py +54 -0
- neverlib/.history/vad/VAD_vadlib_20250805234211.py +70 -0
- neverlib/.history/vad/VAD_vadlib_20250809232455.py +70 -0
- neverlib/.history/vad/VAD_whisper_20250805234211.py +55 -0
- neverlib/.history/vad/VAD_whisper_20250809232456.py +55 -0
- neverlib/.specstory/.what-is-this.md +69 -0
- neverlib/.specstory/history/2025-08-05_17-06Z-/350/277/231/344/270/200/346/255/245/347/232/204/347/233/256/347/232/204/346/230/257/344/273/200/344/271/210.md +424 -0
- neverlib/Docs/audio_aug/test_snr.py +55 -0
- neverlib/__init__.py +2 -2
- neverlib/audio_aug/HarmonicDistortion.py +79 -0
- neverlib/audio_aug/TFDrop.py +41 -0
- neverlib/audio_aug/TFMask.py +56 -0
- neverlib/audio_aug/__init__.py +1 -1
- neverlib/audio_aug/audio_aug.py +19 -5
- neverlib/audio_aug/clip_aug.py +41 -0
- neverlib/audio_aug/coder_aug.py +209 -0
- neverlib/audio_aug/coder_aug2.py +118 -0
- neverlib/audio_aug/loss_packet_aug.py +103 -0
- neverlib/audio_aug/quant_aug.py +78 -0
- neverlib/data_analyze/README.md +234 -0
- neverlib/data_analyze/__init__.py +14 -0
- neverlib/data_analyze/dataset_analyzer.py +590 -0
- neverlib/data_analyze/quality_metrics.py +364 -0
- neverlib/data_analyze/rms_distrubution.py +62 -0
- neverlib/data_analyze/spectral_analysis.py +218 -0
- neverlib/data_analyze/statistics.py +406 -0
- neverlib/data_analyze/temporal_features.py +126 -0
- neverlib/data_analyze/visualization.py +468 -0
- neverlib/filter/README.md +101 -0
- neverlib/filter/__init__.py +7 -0
- neverlib/filter/auto_eq/README.md +165 -0
- neverlib/filter/auto_eq/__init__.py +36 -0
- neverlib/filter/auto_eq/de_eq.py +360 -0
- neverlib/filter/auto_eq/freq_eq.py +76 -0
- neverlib/filter/auto_eq/ga_eq_advanced.py +577 -0
- neverlib/filter/auto_eq/ga_eq_basic.py +385 -0
- neverlib/filter/biquad.py +45 -0
- neverlib/filter/common.py +5 -6
- neverlib/filter/core.py +339 -0
- neverlib/metrics/dnsmos.py +117 -0
- neverlib/metrics/lpc_lsp.py +118 -0
- neverlib/metrics/snr.py +184 -0
- neverlib/metrics/spec.py +136 -0
- neverlib/metrics/test_pesq.py +35 -0
- neverlib/metrics/time.py +68 -0
- neverlib/tests/test_vad.py +21 -0
- neverlib/utils/audio_split.py +2 -1
- neverlib/utils/message.py +4 -4
- neverlib/utils/utils.py +36 -16
- neverlib/vad/PreProcess.py +6 -3
- neverlib/vad/README.md +10 -10
- neverlib/vad/VAD_Energy.py +1 -1
- neverlib/vad/VAD_Silero.py +2 -2
- neverlib/vad/VAD_WebRTC.py +2 -2
- neverlib/vad/VAD_funasr.py +2 -2
- neverlib/vad/VAD_statistics.py +3 -3
- neverlib/vad/VAD_vadlib.py +3 -3
- neverlib/vad/VAD_whisper.py +2 -2
- neverlib/vad/__init__.py +1 -1
- neverlib/vad/class_get_speech.py +4 -4
- neverlib/vad/class_vad.py +1 -1
- neverlib/vad/utils.py +47 -5
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/METADATA +120 -120
- neverlib-0.2.4.dist-info/RECORD +229 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/WHEEL +1 -1
- neverlib/Documents/vad/VAD_Energy.ipynb +0 -159
- neverlib/Documents/vad/VAD_Silero.ipynb +0 -305
- neverlib/Documents/vad/VAD_WebRTC.ipynb +0 -183
- neverlib/Documents/vad/VAD_funasr.ipynb +0 -179
- neverlib/Documents/vad/VAD_ppasr.ipynb +0 -175
- neverlib/Documents/vad/VAD_statistics.ipynb +0 -522
- neverlib/Documents/vad/VAD_vadlib.ipynb +0 -184
- neverlib/Documents/vad/VAD_whisper.ipynb +0 -430
- neverlib/utils/waveform_analyzer.py +0 -51
- neverlib/wav_data/000_short.wav +0 -0
- neverlib-0.2.2.dist-info/RECORD +0 -40
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.2.dist-info → neverlib-0.2.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-06 10:00:00
|
|
4
|
+
Description:
|
|
5
|
+
要计算个性化 MOS 分数(干扰说话者受到惩罚),请提供“-p”参数,例如:python dnsmos.py -t ./SampleClips -o sample.csv -p
|
|
6
|
+
要计算常规 MOS 分数,请省略“-p”参数。例如:python dnsmos.py -t ./SampleClips -o sample.csv
|
|
7
|
+
'''
|
|
8
|
+
import argparse
|
|
9
|
+
import concurrent.futures
|
|
10
|
+
import glob
|
|
11
|
+
import os
|
|
12
|
+
import librosa
|
|
13
|
+
import numpy as np
|
|
14
|
+
import onnxruntime as ort
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import soundfile as sf
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
SAMPLING_RATE = 16000
|
|
20
|
+
INPUT_LENGTH = 9.01
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ComputeScore:
    """Score audio clips with a pair of DNSMOS ONNX models.

    One session (``onnx_sess``) yields raw SIG/BAK/OVRL values from the
    waveform; the other (``p808_onnx_sess``) yields a single value from a
    mel-spectrogram of the same segment.
    """

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Create one ONNX inference session per model file path."""
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the mel-spectrogram of ``audio``, transposed.

        Args:
            audio: waveform handed to ``librosa.feature.melspectrogram``.
            n_mels: number of mel bands.
            frame_size: the FFT size used is ``frame_size + 1``.
            hop_length: hop between frames, in samples.
            sr: sampling rate of ``audio``.
            to_db: when true, convert power to dB relative to the maximum
                and rescale with ``(dB + 40) / 40``.

        Returns:
            The (optionally rescaled) mel-spectrogram, transposed.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """Map raw model outputs through fixed calibration polynomials.

        Args:
            sig: raw SIG value.
            bak: raw BAK value.
            ovr: raw OVRL value.
            is_personalized_MOS: selects the cubic coefficient set when
                true, the quadratic set otherwise.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)``.
        """
        if is_personalized_MOS:
            p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
            p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
            p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
        else:
            p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
            p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
            p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    @staticmethod
    def _loop_pad(audio, min_samples):
        """Repeat ``audio`` onto itself until it has at least ``min_samples``.

        Raises:
            ValueError: if ``audio`` is empty; doubling an empty array never
                grows it, so the loop below would not terminate.
        """
        if len(audio) == 0:
            raise ValueError("cannot score an empty audio clip")
        while len(audio) < min_samples:
            audio = np.append(audio, audio)
        return audio

    def __call__(self, fpath, sampling_rate, is_personalized_MOS):
        """Score one audio file and return its averaged results.

        The file is read, resampled to ``sampling_rate`` when its own rate
        differs, looped until it is at least ``INPUT_LENGTH`` seconds long,
        and then scored on windows of ``INPUT_LENGTH`` seconds taken every
        ``sampling_rate`` samples.

        Args:
            fpath: path of the audio file to read.
            sampling_rate: rate the audio is resampled to before scoring.
            is_personalized_MOS: forwarded to ``get_polyfit_val``.

        Returns:
            dict with keys ``filename``, ``len_in_sec``, ``sr``,
            ``num_hops``, ``OVRL_raw``, ``SIG_raw``, ``BAK_raw``, ``OVRL``,
            ``SIG``, ``BAK`` and ``P808_MOS``; score entries are means over
            the scored windows.

        Raises:
            ValueError: if the file contains no samples.
        """
        aud, input_fs = sf.read(fpath)
        fs = sampling_rate
        if input_fs != fs:
            # Sample rates are keyword-only in librosa >= 0.10; the keyword
            # form is accepted by older releases as well.
            audio = librosa.resample(aud, orig_sr=input_fs, target_sr=fs)
        else:
            audio = aud

        # Duration is reported for the clip as read, before any looping.
        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        audio = self._loop_pad(audio, len_samples)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            seg_start = int(idx * hop_len_samples)
            seg_stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[seg_start:seg_stop]
            if len(audio_seg) < len_samples:
                # Skip a trailing window that is shorter than the model input.
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The mel-spectrogram is computed on the window minus its last 160 samples.
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]

            p808_mos = self.p808_onnx_sess.run(None, {'input_1': p808_input_features})[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(
                None, {'input_1': input_features})[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS)

            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        # Key order is kept stable because it becomes the CSV column order.
        clip_dict = {'filename': fpath, 'len_in_sec': actual_audio_len / fs, 'sr': fs}
        clip_dict['num_hops'] = num_hops
        clip_dict['OVRL_raw'] = np.mean(predicted_mos_ovr_seg_raw)
        clip_dict['SIG_raw'] = np.mean(predicted_mos_sig_seg_raw)
        clip_dict['BAK_raw'] = np.mean(predicted_mos_bak_seg_raw)
        clip_dict['OVRL'] = np.mean(predicted_mos_ovr_seg)
        clip_dict['SIG'] = np.mean(predicted_mos_sig_seg)
        clip_dict['BAK'] = np.mean(predicted_mos_bak_seg)
        clip_dict['P808_MOS'] = np.mean(predicted_p808_mos)
        return clip_dict
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def main(args):
    """Score every ``.wav`` clip found under ``args.testset_dir``.

    Clips directly inside the directory are collected first.  Each
    sub-directory is then searched, descending one level at a time (at most
    ten further levels) until some clips are found.  Clips are scored in a
    thread pool; a clip whose scoring raises is reported on stdout and left
    out of the results.  The result table is written to ``args.csv_path``
    when that is set, otherwise its ``describe()`` summary is printed.

    Args:
        args: parsed command-line namespace providing ``testset_dir``,
            ``csv_path`` and ``personalized_MOS``.
    """
    # Model files are looked up relative to the current working directory.
    p808_model_path = os.path.join('DNSMOS', 'model_v8.onnx')
    if args.personalized_MOS:
        primary_model_path = os.path.join('pDNSMOS', 'sig_bak_ovr.onnx')
    else:
        primary_model_path = os.path.join('DNSMOS', 'sig_bak_ovr.onnx')

    compute_score = ComputeScore(primary_model_path, p808_model_path)

    is_personalized_eval = args.personalized_MOS
    desired_fs = SAMPLING_RATE

    clips = glob.glob(os.path.join(args.testset_dir, "*.wav"))
    for entry in tqdm(glob.glob(os.path.join(args.testset_dir, "*"))):
        # Only directories can contain further clips.
        if not os.path.isdir(entry):
            continue
        # glob already returns entries prefixed with testset_dir; joining the
        # prefix on again would double it for relative directories.
        audio_path = entry
        max_recursion_depth = 10
        audio_clips_list = glob.glob(os.path.join(audio_path, "*.wav"))
        while not audio_clips_list and max_recursion_depth > 0:
            # Without recursive=True, "**" matches exactly one directory level.
            audio_path = os.path.join(audio_path, "**")
            audio_clips_list = glob.glob(os.path.join(audio_path, "*.wav"))
            max_recursion_depth -= 1
        clips.extend(audio_clips_list)

    rows = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {
            executor.submit(compute_score, clip, desired_fs, is_personalized_eval): clip
            for clip in clips
        }
        completed = concurrent.futures.as_completed(future_to_url)
        # as_completed has no length, so give the progress bar its total.
        for future in tqdm(completed, total=len(future_to_url)):
            clip = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                # Best effort: one bad clip must not abort the whole run.
                print('%r generated an exception: %s' % (clip, exc))
            else:
                rows.append(data)

    df = pd.DataFrame(rows)
    if args.csv_path:
        df.to_csv(args.csv_path)
    else:
        print(df.describe())
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
if __name__ == "__main__":
    # Command-line entry point: declare the three options, parse, run main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', "--testset_dir",
        default='.',
        help='Path to the dir containing audio clips in .wav to be evaluated',
    )
    parser.add_argument(
        '-o', "--csv_path",
        default=None,
        help='Dir to the csv that saves the results',
    )
    parser.add_argument(
        '-p', "--personalized_MOS",
        action='store_true',
        help='Flag to indicate if personalized MOS score is needed or regular',
    )

    args = parser.parse_args()
    main(args)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-06 10:00:00
|
|
4
|
+
Description:
|
|
5
|
+
要计算个性化 MOS 分数(干扰说话者受到惩罚),请提供“-p”参数,例如:python dnsmos.py -t ./SampleClips -o sample.csv -p
|
|
6
|
+
要计算常规 MOS 分数,请省略“-p”参数。例如:python dnsmos.py -t ./SampleClips -o sample.csv
|
|
7
|
+
'''
|
|
8
|
+
import argparse
|
|
9
|
+
import concurrent.futures
|
|
10
|
+
import glob
|
|
11
|
+
import os
|
|
12
|
+
import librosa
|
|
13
|
+
import numpy as np
|
|
14
|
+
import onnxruntime as ort
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import soundfile as sf
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
SAMPLING_RATE = 16000
|
|
20
|
+
INPUT_LENGTH = 9.01
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ComputeScore:
    """Score audio clips with a pair of DNSMOS ONNX models.

    One session (``onnx_sess``) yields raw SIG/BAK/OVRL values from the
    waveform; the other (``p808_onnx_sess``) yields a single value from a
    mel-spectrogram of the same segment.
    """

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Create one ONNX inference session per model file path."""
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the mel-spectrogram of ``audio``, transposed.

        Args:
            audio: waveform handed to ``librosa.feature.melspectrogram``.
            n_mels: number of mel bands.
            frame_size: the FFT size used is ``frame_size + 1``.
            hop_length: hop between frames, in samples.
            sr: sampling rate of ``audio``.
            to_db: when true, convert power to dB relative to the maximum
                and rescale with ``(dB + 40) / 40``.

        Returns:
            The (optionally rescaled) mel-spectrogram, transposed.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """Map raw model outputs through fixed calibration polynomials.

        Args:
            sig: raw SIG value.
            bak: raw BAK value.
            ovr: raw OVRL value.
            is_personalized_MOS: selects the cubic coefficient set when
                true, the quadratic set otherwise.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)``.
        """
        if is_personalized_MOS:
            p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
            p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
            p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
        else:
            p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
            p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
            p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    @staticmethod
    def _loop_pad(audio, min_samples):
        """Repeat ``audio`` onto itself until it has at least ``min_samples``.

        Raises:
            ValueError: if ``audio`` is empty; doubling an empty array never
                grows it, so the loop below would not terminate.
        """
        if len(audio) == 0:
            raise ValueError("cannot score an empty audio clip")
        while len(audio) < min_samples:
            audio = np.append(audio, audio)
        return audio

    def __call__(self, fpath, sampling_rate, is_personalized_MOS):
        """Score one audio file and return its averaged results.

        The file is read, resampled to ``sampling_rate`` when its own rate
        differs, looped until it is at least ``INPUT_LENGTH`` seconds long,
        and then scored on windows of ``INPUT_LENGTH`` seconds taken every
        ``sampling_rate`` samples.

        Args:
            fpath: path of the audio file to read.
            sampling_rate: rate the audio is resampled to before scoring.
            is_personalized_MOS: forwarded to ``get_polyfit_val``.

        Returns:
            dict with keys ``filename``, ``len_in_sec``, ``sr``,
            ``num_hops``, ``OVRL_raw``, ``SIG_raw``, ``BAK_raw``, ``OVRL``,
            ``SIG``, ``BAK`` and ``P808_MOS``; score entries are means over
            the scored windows.

        Raises:
            ValueError: if the file contains no samples.
        """
        aud, input_fs = sf.read(fpath)
        fs = sampling_rate
        if input_fs != fs:
            # Sample rates are keyword-only in librosa >= 0.10; the keyword
            # form is accepted by older releases as well.
            audio = librosa.resample(aud, orig_sr=input_fs, target_sr=fs)
        else:
            audio = aud

        # Duration is reported for the clip as read, before any looping.
        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        audio = self._loop_pad(audio, len_samples)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            seg_start = int(idx * hop_len_samples)
            seg_stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[seg_start:seg_stop]
            if len(audio_seg) < len_samples:
                # Skip a trailing window that is shorter than the model input.
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The mel-spectrogram is computed on the window minus its last 160 samples.
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]

            p808_mos = self.p808_onnx_sess.run(None, {'input_1': p808_input_features})[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(
                None, {'input_1': input_features})[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS)

            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        # Key order is kept stable because it becomes the CSV column order.
        clip_dict = {'filename': fpath, 'len_in_sec': actual_audio_len / fs, 'sr': fs}
        clip_dict['num_hops'] = num_hops
        clip_dict['OVRL_raw'] = np.mean(predicted_mos_ovr_seg_raw)
        clip_dict['SIG_raw'] = np.mean(predicted_mos_sig_seg_raw)
        clip_dict['BAK_raw'] = np.mean(predicted_mos_bak_seg_raw)
        clip_dict['OVRL'] = np.mean(predicted_mos_ovr_seg)
        clip_dict['SIG'] = np.mean(predicted_mos_sig_seg)
        clip_dict['BAK'] = np.mean(predicted_mos_bak_seg)
        clip_dict['P808_MOS'] = np.mean(predicted_p808_mos)
        return clip_dict
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def main(args):
    """Score every ``.wav`` clip found under ``args.testset_dir``.

    Clips directly inside the directory are collected first.  Each
    sub-directory is then searched, descending one level at a time (at most
    ten further levels) until some clips are found.  Clips are scored in a
    thread pool; a clip whose scoring raises is reported on stdout and left
    out of the results.  The result table is written to ``args.csv_path``
    when that is set, otherwise its ``describe()`` summary is printed.

    Args:
        args: parsed command-line namespace providing ``testset_dir``,
            ``csv_path`` and ``personalized_MOS``.
    """
    # Model files are looked up relative to the current working directory.
    p808_model_path = os.path.join('DNSMOS', 'model_v8.onnx')
    if args.personalized_MOS:
        primary_model_path = os.path.join('pDNSMOS', 'sig_bak_ovr.onnx')
    else:
        primary_model_path = os.path.join('DNSMOS', 'sig_bak_ovr.onnx')

    compute_score = ComputeScore(primary_model_path, p808_model_path)

    is_personalized_eval = args.personalized_MOS
    desired_fs = SAMPLING_RATE

    clips = glob.glob(os.path.join(args.testset_dir, "*.wav"))
    for entry in tqdm(glob.glob(os.path.join(args.testset_dir, "*"))):
        # Only directories can contain further clips.
        if not os.path.isdir(entry):
            continue
        # glob already returns entries prefixed with testset_dir; joining the
        # prefix on again would double it for relative directories.
        audio_path = entry
        max_recursion_depth = 10
        audio_clips_list = glob.glob(os.path.join(audio_path, "*.wav"))
        while not audio_clips_list and max_recursion_depth > 0:
            # Without recursive=True, "**" matches exactly one directory level.
            audio_path = os.path.join(audio_path, "**")
            audio_clips_list = glob.glob(os.path.join(audio_path, "*.wav"))
            max_recursion_depth -= 1
        clips.extend(audio_clips_list)

    rows = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {
            executor.submit(compute_score, clip, desired_fs, is_personalized_eval): clip
            for clip in clips
        }
        completed = concurrent.futures.as_completed(future_to_url)
        # as_completed has no length, so give the progress bar its total.
        for future in tqdm(completed, total=len(future_to_url)):
            clip = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                # Best effort: one bad clip must not abort the whole run.
                print('%r generated an exception: %s' % (clip, exc))
            else:
                rows.append(data)

    df = pd.DataFrame(rows)
    if args.csv_path:
        df.to_csv(args.csv_path)
    else:
        print(df.describe())
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
if __name__ == "__main__":
    # Command-line entry point: declare the three options, parse, run main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', "--testset_dir",
        default='.',
        help='Path to the dir containing audio clips in .wav to be evaluated',
    )
    parser.add_argument(
        '-o', "--csv_path",
        default=None,
        help='Dir to the csv that saves the results',
    )
    parser.add_argument(
        '-p', "--personalized_MOS",
        action='store_true',
        help='Flag to indicate if personalized MOS score is needed or regular',
    )

    args = parser.parse_args()
    main(args)
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-06 10:00:00
|
|
4
|
+
Description:
|
|
5
|
+
要计算个性化 MOS 分数(干扰说话者受到惩罚),请提供“-p”参数,例如:python dnsmos.py -t ./SampleClips -o sample.csv -p
|
|
6
|
+
要计算常规 MOS 分数,请省略“-p”参数。例如:python dnsmos.py -t ./SampleClips -o sample.csv
|
|
7
|
+
'''
|
|
8
|
+
import argparse
|
|
9
|
+
import concurrent.futures
|
|
10
|
+
import glob
|
|
11
|
+
import os
|
|
12
|
+
import librosa
|
|
13
|
+
import numpy as np
|
|
14
|
+
import onnxruntime as ort
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import soundfile as sf
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
SAMPLING_RATE = 16000
|
|
20
|
+
INPUT_LENGTH = 9.01
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ComputeScore:
    """Score audio clips with a pair of DNSMOS ONNX models.

    One session (``onnx_sess``) yields raw SIG/BAK/OVRL values from the
    waveform; the other (``p808_onnx_sess``) yields a single value from a
    mel-spectrogram of the same segment.
    """

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Create one ONNX inference session per model file path."""
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the mel-spectrogram of ``audio``, transposed.

        Args:
            audio: waveform handed to ``librosa.feature.melspectrogram``.
            n_mels: number of mel bands.
            frame_size: the FFT size used is ``frame_size + 1``.
            hop_length: hop between frames, in samples.
            sr: sampling rate of ``audio``.
            to_db: when true, convert power to dB relative to the maximum
                and rescale with ``(dB + 40) / 40``.

        Returns:
            The (optionally rescaled) mel-spectrogram, transposed.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """Map raw model outputs through fixed calibration polynomials.

        Args:
            sig: raw SIG value.
            bak: raw BAK value.
            ovr: raw OVRL value.
            is_personalized_MOS: selects the cubic coefficient set when
                true, the quadratic set otherwise.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)``.
        """
        if is_personalized_MOS:
            p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
            p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
            p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
        else:
            p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
            p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
            p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    @staticmethod
    def _loop_pad(audio, min_samples):
        """Repeat ``audio`` onto itself until it has at least ``min_samples``.

        Raises:
            ValueError: if ``audio`` is empty; doubling an empty array never
                grows it, so the loop below would not terminate.
        """
        if len(audio) == 0:
            raise ValueError("cannot score an empty audio clip")
        while len(audio) < min_samples:
            audio = np.append(audio, audio)
        return audio

    def __call__(self, fpath, sampling_rate, is_personalized_MOS):
        """Score one audio file and return its averaged results.

        The file is read, resampled to ``sampling_rate`` when its own rate
        differs, looped until it is at least ``INPUT_LENGTH`` seconds long,
        and then scored on windows of ``INPUT_LENGTH`` seconds taken every
        ``sampling_rate`` samples.

        Args:
            fpath: path of the audio file to read.
            sampling_rate: rate the audio is resampled to before scoring.
            is_personalized_MOS: forwarded to ``get_polyfit_val``.

        Returns:
            dict with keys ``filename``, ``len_in_sec``, ``sr``,
            ``num_hops``, ``OVRL_raw``, ``SIG_raw``, ``BAK_raw``, ``OVRL``,
            ``SIG``, ``BAK`` and ``P808_MOS``; score entries are means over
            the scored windows.

        Raises:
            ValueError: if the file contains no samples.
        """
        aud, input_fs = sf.read(fpath)
        fs = sampling_rate
        if input_fs != fs:
            # Sample rates are keyword-only in librosa >= 0.10; the keyword
            # form is accepted by older releases as well.
            audio = librosa.resample(aud, orig_sr=input_fs, target_sr=fs)
        else:
            audio = aud

        # Duration is reported for the clip as read, before any looping.
        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        audio = self._loop_pad(audio, len_samples)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            seg_start = int(idx * hop_len_samples)
            seg_stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[seg_start:seg_stop]
            if len(audio_seg) < len_samples:
                # Skip a trailing window that is shorter than the model input.
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The mel-spectrogram is computed on the window minus its last 160 samples.
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]

            p808_mos = self.p808_onnx_sess.run(None, {'input_1': p808_input_features})[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(
                None, {'input_1': input_features})[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS)

            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        # Key order is kept stable because it becomes the CSV column order.
        clip_dict = {'filename': fpath, 'len_in_sec': actual_audio_len / fs, 'sr': fs}
        clip_dict['num_hops'] = num_hops
        clip_dict['OVRL_raw'] = np.mean(predicted_mos_ovr_seg_raw)
        clip_dict['SIG_raw'] = np.mean(predicted_mos_sig_seg_raw)
        clip_dict['BAK_raw'] = np.mean(predicted_mos_bak_seg_raw)
        clip_dict['OVRL'] = np.mean(predicted_mos_ovr_seg)
        clip_dict['SIG'] = np.mean(predicted_mos_sig_seg)
        clip_dict['BAK'] = np.mean(predicted_mos_bak_seg)
        clip_dict['P808_MOS'] = np.mean(predicted_p808_mos)
        return clip_dict
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _collect_wav_clips(testset_dir, max_depth=10, progress=None):
    """Return the ``.wav`` paths to score under *testset_dir*.

    The result contains every ``*.wav`` directly inside *testset_dir*, plus,
    for each sub-directory, the ``*.wav`` files found at the shallowest
    nesting level that has any (searching at most *max_depth* levels below
    the sub-directory itself).

    Args:
        testset_dir: Directory to scan; may be relative or absolute.
        max_depth: How many extra levels to descend into a sub-directory
            when it has no ``.wav`` files directly inside it.
        progress: Optional callable wrapping the iterable of top-level
            entries (for example ``tqdm``) to report scanning progress.

    Returns:
        A list of matching paths, each prefixed with *testset_dir*.
    """
    clips = glob.glob(os.path.join(testset_dir, "*.wav"))
    entries = glob.glob(os.path.join(testset_dir, "*"))
    if progress is not None:
        entries = progress(entries)

    for entry in entries:
        # glob already returns paths that include testset_dir; joining the
        # directory on again would break every relative testset_dir.
        if not os.path.isdir(entry):
            continue
        # Escape the discovered name so brackets in it are not read as a pattern.
        pattern_root = glob.escape(entry)
        found = glob.glob(os.path.join(pattern_root, "*.wav"))
        depth_left = max_depth
        while not found and depth_left > 0:
            # One more wildcard component == one directory level deeper.
            pattern_root = os.path.join(pattern_root, "*")
            found = glob.glob(os.path.join(pattern_root, "*.wav"))
            depth_left -= 1
        clips.extend(found)
    return clips


def main(args):
    """Score every clip under ``args.testset_dir`` and report the results.

    Args:
        args: Parsed command-line namespace providing ``testset_dir``,
            ``csv_path`` and ``personalized_MOS``.

    The ONNX model paths are relative, so they are resolved against the
    current working directory.  Clips that fail to score are reported on
    stdout and skipped.  Results are written to ``args.csv_path`` when it is
    set; otherwise summary statistics are printed.
    """
    p808_model_path = os.path.join('DNSMOS', 'model_v8.onnx')
    if args.personalized_MOS:
        primary_model_path = os.path.join('pDNSMOS', 'sig_bak_ovr.onnx')
    else:
        primary_model_path = os.path.join('DNSMOS', 'sig_bak_ovr.onnx')

    compute_score = ComputeScore(primary_model_path, p808_model_path)

    is_personalized_eval = args.personalized_MOS
    desired_fs = SAMPLING_RATE
    clips = _collect_wav_clips(args.testset_dir, progress=tqdm)

    rows = []
    for clip in tqdm(clips):
        try:
            data = compute_score(clip, desired_fs, is_personalized_eval)
        except Exception as exc:
            # Best effort: one unreadable clip must not abort the whole run.
            print('%r generated an exception: %s' % (clip, exc))
        else:
            rows.append(data)

    df = pd.DataFrame(rows)
    if args.csv_path:
        csv_path = args.csv_path
        df.to_csv(csv_path)
    elif df.empty:
        # describe() raises ValueError on a frame without columns.
        print('No clips were scored.')
    else:
        print(df.describe())
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
if __name__ == "__main__":
    # Command-line entry point: collect the three options and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', "--testset_dir",
        default='.',
        help='Path to the dir containing audio clips in .wav to be evaluated',
    )
    cli.add_argument(
        '-o', "--csv_path",
        default=None,
        help='Dir to the csv that saves the results',
    )
    cli.add_argument(
        '-p', "--personalized_MOS",
        action='store_true',
        help='Flag to indicate if personalized MOS score is needed or regular',
    )

    args = cli.parse_args()
    main(args)
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Author: 凌逆战 | Never
|
|
3
|
+
Date: 2025-08-06 10:00:00
|
|
4
|
+
Description:
|
|
5
|
+
要计算个性化 MOS 分数(干扰说话者受到惩罚),请提供“-p”参数,例如:python dnsmos.py -t ./SampleClips -o sample.csv -p
|
|
6
|
+
要计算常规 MOS 分数,请省略“-p”参数。例如:python dnsmos.py -t ./SampleClips -o sample.csv
|
|
7
|
+
'''
|
|
8
|
+
import argparse
|
|
9
|
+
import concurrent.futures
|
|
10
|
+
import glob
|
|
11
|
+
import os
|
|
12
|
+
import librosa
|
|
13
|
+
import numpy as np
|
|
14
|
+
import onnxruntime as ort
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import soundfile as sf
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
# Sample rate (Hz) that main() passes to ComputeScore; clips read at another rate are resampled to it.
SAMPLING_RATE = 16000
|
|
20
|
+
# Length in seconds of each scored segment; ComputeScore converts it to samples with int(INPUT_LENGTH * fs).
INPUT_LENGTH = 9.01
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ComputeScore:
    """Callable that scores one audio file with two ONNX models.

    ``onnx_sess`` produces three raw scores per segment (signal, background,
    overall) and ``p808_onnx_sess`` produces one P.808 score from a
    mel-spectrogram of the same segment.  Calling the instance with a file
    path returns a dict of per-clip averages over all scored segments.
    """

    # Calibration polynomials applied to the raw primary-model outputs.
    # Built once here instead of once per scored segment.
    _PERSONALIZED_FIT = {
        'sig': np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726]),
        'bak': np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132]),
        'ovr': np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046]),
    }
    _REGULAR_FIT = {
        'sig': np.poly1d([-0.08397278, 1.22083953, 0.0052439]),
        'bak': np.poly1d([-0.13166888, 1.60915514, -0.39604546]),
        'ovr': np.poly1d([-0.06766283, 1.11546468, 0.04602535]),
    }

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Load both ONNX models.

        Args:
            primary_model_path: Path to the model yielding (sig, bak, ovr).
            p808_model_path: Path to the model yielding the P.808 score.
        """
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the mel-spectrogram of *audio*, transposed to (frames, n_mels).

        The FFT size is ``frame_size + 1``.  With ``to_db`` the power
        spectrogram is converted to dB relative to its maximum and then
        rescaled as ``(dB + 40) / 40``.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """Map raw (sig, bak, ovr) scores through the calibration polynomials.

        Args:
            sig, bak, ovr: Raw outputs of the primary model.
            is_personalized_MOS: Selects the personalized coefficient set
                when truthy, the regular one otherwise.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)``.
        """
        fit = self._PERSONALIZED_FIT if is_personalized_MOS else self._REGULAR_FIT
        return fit['sig'](sig), fit['bak'](bak), fit['ovr'](ovr)

    @staticmethod
    def _repeat_to_min_length(audio, min_len):
        """Double *audio* by self-concatenation until it has at least *min_len* samples.

        Raises:
            ValueError: If *audio* is empty, since doubling would never terminate.
        """
        if len(audio) == 0:
            raise ValueError('audio contains no samples')
        while len(audio) < min_len:
            audio = np.append(audio, audio)
        return audio

    def __call__(self, fpath, sampling_rate, is_personalized_MOS):
        """Score the audio file at *fpath*.

        Args:
            fpath: Path of the audio file to read.
            sampling_rate: Rate the audio is resampled to before scoring.
            is_personalized_MOS: Forwarded to :meth:`get_polyfit_val`.

        Returns:
            Dict with ``filename``, ``len_in_sec``, ``sr``, ``num_hops`` and
            the segment-averaged ``OVRL_raw``, ``SIG_raw``, ``BAK_raw``,
            ``OVRL``, ``SIG``, ``BAK`` and ``P808_MOS`` scores.

        Raises:
            ValueError: If the file contains no audio samples.
        """
        aud, input_fs = sf.read(fpath)
        if aud.ndim > 1:
            # soundfile returns (frames, channels) for multi-channel files;
            # average the channels so the 1-D processing below stays valid.
            aud = aud.mean(axis=1)

        fs = sampling_rate
        if input_fs != fs:
            # Keyword arguments: librosa >= 0.10 rejects positional rates.
            audio = librosa.resample(aud, orig_sr=input_fs, target_sr=fs)
        else:
            audio = aud

        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        audio = self._repeat_to_min_length(audio, len_samples)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        score_keys = ('OVRL_raw', 'SIG_raw', 'BAK_raw', 'OVRL', 'SIG', 'BAK', 'P808_MOS')
        scores = {key: [] for key in score_keys}

        for idx in range(num_hops):
            # Integer window bounds: a float-computed end can land one sample
            # short and make the window be skipped.
            start = int(idx * hop_len_samples)
            audio_seg = audio[start:start + len_samples]
            if len(audio_seg) < len_samples:
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]
            oi = {'input_1': input_features}
            p808_oi = {'input_1': p808_input_features}

            p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS)

            scores['SIG_raw'].append(mos_sig_raw)
            scores['BAK_raw'].append(mos_bak_raw)
            scores['OVRL_raw'].append(mos_ovr_raw)
            scores['SIG'].append(mos_sig)
            scores['BAK'].append(mos_bak)
            scores['OVRL'].append(mos_ovr)
            scores['P808_MOS'].append(p808_mos)

        clip_dict = {'filename': fpath, 'len_in_sec': actual_audio_len / fs, 'sr': fs}
        clip_dict['num_hops'] = num_hops
        for key in score_keys:
            clip_dict[key] = np.mean(scores[key])
        return clip_dict
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _collect_wav_clips(testset_dir, max_depth=10, progress=None):
    """Return the ``.wav`` paths to score under *testset_dir*.

    The result contains every ``*.wav`` directly inside *testset_dir*, plus,
    for each sub-directory, the ``*.wav`` files found at the shallowest
    nesting level that has any (searching at most *max_depth* levels below
    the sub-directory itself).

    Args:
        testset_dir: Directory to scan; may be relative or absolute.
        max_depth: How many extra levels to descend into a sub-directory
            when it has no ``.wav`` files directly inside it.
        progress: Optional callable wrapping the iterable of top-level
            entries (for example ``tqdm``) to report scanning progress.

    Returns:
        A list of matching paths, each prefixed with *testset_dir*.
    """
    clips = glob.glob(os.path.join(testset_dir, "*.wav"))
    entries = glob.glob(os.path.join(testset_dir, "*"))
    if progress is not None:
        entries = progress(entries)

    for entry in entries:
        # glob already returns paths that include testset_dir; joining the
        # directory on again would break every relative testset_dir.
        if not os.path.isdir(entry):
            continue
        # Escape the discovered name so brackets in it are not read as a pattern.
        pattern_root = glob.escape(entry)
        found = glob.glob(os.path.join(pattern_root, "*.wav"))
        depth_left = max_depth
        while not found and depth_left > 0:
            # One more wildcard component == one directory level deeper.
            pattern_root = os.path.join(pattern_root, "*")
            found = glob.glob(os.path.join(pattern_root, "*.wav"))
            depth_left -= 1
        clips.extend(found)
    return clips


def main(args):
    """Score every clip under ``args.testset_dir`` and report the results.

    Args:
        args: Parsed command-line namespace providing ``testset_dir``,
            ``csv_path`` and ``personalized_MOS``.

    The ONNX model paths are relative, so they are resolved against the
    current working directory.  Clips that fail to score are reported on
    stdout and skipped.  Results are written to ``args.csv_path`` when it is
    set; otherwise summary statistics are printed.
    """
    p808_model_path = os.path.join('DNSMOS', 'model_v8.onnx')
    if args.personalized_MOS:
        primary_model_path = os.path.join('pDNSMOS', 'sig_bak_ovr.onnx')
    else:
        primary_model_path = os.path.join('DNSMOS', 'sig_bak_ovr.onnx')

    compute_score = ComputeScore(primary_model_path, p808_model_path)

    is_personalized_eval = args.personalized_MOS
    desired_fs = SAMPLING_RATE
    clips = _collect_wav_clips(args.testset_dir, progress=tqdm)

    rows = []
    for clip in tqdm(clips):
        try:
            data = compute_score(clip, desired_fs, is_personalized_eval)
        except Exception as exc:
            # Best effort: one unreadable clip must not abort the whole run.
            print('%r generated an exception: %s' % (clip, exc))
        else:
            rows.append(data)

    df = pd.DataFrame(rows)
    if args.csv_path:
        csv_path = args.csv_path
        df.to_csv(csv_path)
    elif df.empty:
        # describe() raises ValueError on a frame without columns.
        print('No clips were scored.')
    else:
        print(df.describe())
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
if __name__ == "__main__":
    # Command-line entry point: collect the three options and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', "--testset_dir",
        default='.',
        help='Path to the dir containing audio clips in .wav to be evaluated',
    )
    cli.add_argument(
        '-o', "--csv_path",
        default=None,
        help='Dir to the csv that saves the results',
    )
    cli.add_argument(
        '-p', "--personalized_MOS",
        action='store_true',
        help='Flag to indicate if personalized MOS score is needed or regular',
    )

    args = cli.parse_args()
    main(args)
|