visqol-python 3.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- visqol/__init__.py +20 -0
- visqol/__main__.py +92 -0
- visqol/alignment.py +82 -0
- visqol/analysis_window.py +52 -0
- visqol/api.py +110 -0
- visqol/audio_utils.py +90 -0
- visqol/gammatone.py +418 -0
- visqol/model/libsvm_nu_svr_model.txt +324 -0
- visqol/nsim.py +134 -0
- visqol/patch_creator.py +222 -0
- visqol/patch_selector.py +357 -0
- visqol/quality_mapper.py +114 -0
- visqol/signal_utils.py +83 -0
- visqol/visqol_core.py +240 -0
- visqol/visqol_manager.py +194 -0
- visqol_python-3.3.3.dist-info/METADATA +223 -0
- visqol_python-3.3.3.dist-info/RECORD +21 -0
- visqol_python-3.3.3.dist-info/WHEEL +5 -0
- visqol_python-3.3.3.dist-info/entry_points.txt +2 -0
- visqol_python-3.3.3.dist-info/licenses/LICENSE +201 -0
- visqol_python-3.3.3.dist-info/top_level.txt +1 -0
visqol/visqol_core.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core ViSQOL algorithm: assembles all components into a similarity computation pipeline.
|
|
3
|
+
|
|
4
|
+
Corresponds to C++ file: visqol.cc
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
import numpy as np
|
|
9
|
+
from typing import List
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
from visqol.audio_utils import AudioSignal, scale_to_match_sound_pressure_level
|
|
13
|
+
from visqol.analysis_window import AnalysisWindow
|
|
14
|
+
from visqol.gammatone import (
|
|
15
|
+
GammatoneSpectrogramBuilder, Spectrogram,
|
|
16
|
+
prepare_spectrograms_for_comparison,
|
|
17
|
+
)
|
|
18
|
+
from visqol.nsim import PatchSimilarityResult, measure_patch_similarity
|
|
19
|
+
from visqol.patch_selector import (
|
|
20
|
+
find_most_optimal_deg_patches,
|
|
21
|
+
finely_align_and_recreate_patches,
|
|
22
|
+
)
|
|
23
|
+
from visqol.quality_mapper import SimilarityToQualityMapper
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _empty_array() -> np.ndarray:
    """Return a fresh zero-length array, used as the per-instance array default."""
    return np.array([])


@dataclass
class SimilarityResult:
    """
    Everything produced by one ViSQOL reference/degraded comparison.

    All fields have defaults, so ``SimilarityResult()`` is a valid empty
    result. Array and list defaults are created per instance, never shared.
    """
    # Predicted MOS-LQO score (after the low-similarity override).
    moslqo: float = 0.0
    # Mean of ``fvnsim`` over all frequency bands.
    vnsim: float = 0.0
    # Per-frequency-band mean similarity across patches.
    fvnsim: np.ndarray = field(default_factory=_empty_array)
    # Per-frequency-band mean of the lowest-quantile patch similarities.
    fvnsim10: np.ndarray = field(default_factory=_empty_array)
    # Per-frequency-band pooled standard deviation across patches.
    fstdnsim: np.ndarray = field(default_factory=_empty_array)
    # Per-frequency-band mean degraded energy across patches.
    fvdegenergy: np.ndarray = field(default_factory=_empty_array)
    # Center frequency bands taken from the reference spectrogram.
    center_freq_bands: np.ndarray = field(default_factory=_empty_array)
    # Per-patch similarity results the aggregates were computed from.
    patch_sims: List[PatchSimilarityResult] = field(default_factory=list)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def calc_per_patch_mean_freq_band_means(
    sim_results: List[PatchSimilarityResult]
) -> np.ndarray:
    """
    Average every frequency band's mean similarity over all patches (fvnsim).

    Counterpart of C++ Visqol::CalcPerPatchMeanFreqBandMeans.

    Args:
        sim_results: Per-patch results; each provides ``freq_band_means``.

    Returns:
        The unweighted mean over patches, taken along the patch axis.
    """
    # One row per patch (columns are bands, assuming 1-D freq_band_means).
    per_patch_means = np.array([patch.freq_band_means for patch in sim_results])
    return per_patch_means.mean(axis=0)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def calc_per_patch_freq_band_quantile(
    sim_results: List[PatchSimilarityResult],
    quantile: float = 0.10,
) -> np.ndarray:
    """
    Per frequency band, average the lowest ``quantile`` share of patch means.

    With the default quantile this is fvnsim10.
    Counterpart of C++ Visqol::CalcPerPatchFreqBandQuantile.

    Args:
        sim_results: Per-patch results; each provides ``freq_band_means``.
        quantile: Fraction of patches (taken from the low end) to average.

    Returns:
        Array with one value per frequency band.
    """
    band_count = len(sim_results[0].freq_band_means)
    # Same for every band; at least one patch is always kept.
    keep = max(1, int(len(sim_results) * quantile))

    quantile_means = np.zeros(band_count)
    for idx in range(band_count):
        lowest = sorted(r.freq_band_means[idx] for r in sim_results)[:keep]
        quantile_means[idx] = np.mean(lowest)
    return quantile_means
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def calc_per_patch_mean_freq_band_deg_energy(
    sim_results: List[PatchSimilarityResult]
) -> np.ndarray:
    """
    Average every frequency band's degraded energy over all patches.

    This is fvdegenergy.
    Counterpart of C++ Visqol::CalcPerPatchMeanFreqBandDegradedEnergy.

    Args:
        sim_results: Per-patch results; each provides ``freq_band_deg_energy``.

    Returns:
        The unweighted mean over patches, taken along the patch axis.
    """
    energy_rows = [patch.freq_band_deg_energy for patch in sim_results]
    return np.array(energy_rows).mean(axis=0)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def calc_per_patch_mean_freq_band_stddevs(
    sim_results: List[PatchSimilarityResult],
    frame_duration: float,
) -> np.ndarray:
    """
    Calculate the pooled per-frequency-band standard deviation across patches.
    This is fstdnsim.
    Matches C++ Visqol::CalcPerPatchMeanFreqBandStdDevs.

    Uses the pooled variance formula:
    https://en.wikipedia.org/wiki/Pooled_variance

    Each patch is weighted by its frame count,
    ceil((ref_patch_end_time - ref_patch_start_time) / frame_duration).

    Args:
        sim_results: Per-patch results; each must provide freq_band_means,
            freq_band_stddevs, ref_patch_start_time and ref_patch_end_time.
        frame_duration: Duration of one frame in seconds.

    Returns:
        One pooled standard deviation per frequency band; all zeros when the
        total frame count is 1 or less.

    Raises:
        IndexError: If sim_results is empty.
    """
    num_freq_bands = len(sim_results[0].freq_band_means)

    total_frame_count = 0
    contribution = np.zeros(num_freq_bands)

    for patch in sim_results:
        secs_in_patch = patch.ref_patch_end_time - patch.ref_patch_start_time
        frame_count = int(math.ceil(secs_in_patch / frame_duration))
        total_frame_count += frame_count

        # Whole-band arithmetic instead of a Python loop per band. Assumes the
        # per-band statistics are 1-D; only the first num_freq_bands entries
        # are used, as with per-band indexing.
        stddevs = np.asarray(patch.freq_band_stddevs, dtype=float)[:num_freq_bands]
        means = np.asarray(patch.freq_band_means, dtype=float)[:num_freq_bands]
        # Two separate accumulations keep the floating-point operation order
        # of the element-by-element formulation.
        contribution += (frame_count - 1) * stddevs * stddevs
        contribution += frame_count * means * means

    if total_frame_count <= 1:
        return np.zeros(num_freq_bands)

    # Global unweighted per-band mean over patches: the same quantity that
    # calc_per_patch_mean_freq_band_means returns (fvnsim).
    fvnsim = np.mean(
        np.array([patch.freq_band_means for patch in sim_results]), axis=0
    )

    variance = (contribution - fvnsim * fvnsim * total_frame_count) / (
        total_frame_count - 1
    )

    # The unweighted mean combined with frame-weighted sums (and rounding) can
    # push the variance below zero. Clamp BEFORE the square root: np.where
    # evaluates both branches, so sqrt on the raw values would emit an
    # "invalid value" RuntimeWarning even though the NaNs are discarded.
    return np.sqrt(np.maximum(variance, 0.0))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def alter_for_similarity_extremes(vnsim: float, moslqo: float) -> float:
    """
    Override the predicted score when the similarity is extremely low.

    Counterpart of C++ Visqol::AlterForSimilarityExtremes.

    Args:
        vnsim: Overall similarity value.
        moslqo: Predicted MOS-LQO score.

    Returns:
        1.0 when ``vnsim`` is below 0.15, otherwise ``moslqo`` unchanged.
    """
    return 1.0 if vnsim < 0.15 else moslqo
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def calc_frame_duration(frame_size: int, sample_rate: int) -> float:
    """
    Convert a frame length in samples to seconds.

    Args:
        frame_size: Number of samples in one frame.
        sample_rate: Samples per second.

    Returns:
        ``frame_size`` divided by ``sample_rate``, as a float.
    """
    samples_per_second = float(sample_rate)
    return frame_size / samples_per_second
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class VisqolCore:
    """
    Core ViSQOL algorithm.

    Holds no state of its own; every collaborator is passed into
    ``calculate_similarity``, which runs the whole comparison pipeline.
    """

    def calculate_similarity(
        self,
        ref_signal: AudioSignal,
        deg_signal: AudioSignal,
        spect_builder: GammatoneSpectrogramBuilder,
        window: AnalysisWindow,
        patch_creator,
        search_window: int,
        quality_mapper: SimilarityToQualityMapper,
        disable_realignment: bool = False,
    ) -> SimilarityResult:
        """
        Compare a degraded signal against its reference and score it.

        Args:
            ref_signal: Reference audio signal.
            deg_signal: Degraded audio signal.
            spect_builder: Gammatone spectrogram builder.
            window: Analysis window.
            patch_creator: Object providing ``create_ref_patch_indices`` and
                ``create_patches_from_indices`` (ImagePatchCreator or
                VadPatchCreator).
            search_window: Search window radius in patch units.
            quality_mapper: Similarity-to-quality mapper.
            disable_realignment: If True, skip fine realignment.

        Returns:
            SimilarityResult with the MOS-LQO score, the aggregate per-band
            statistics and the per-patch results they were derived from.
        """
        # Stage 1: preprocessing - match the degraded signal's sound pressure
        # level to the reference.
        scaled_deg = scale_to_match_sound_pressure_level(ref_signal, deg_signal)

        # Spectrograms of both signals with the same builder and window.
        ref_spect = spect_builder.build(ref_signal, window)
        deg_spect = spect_builder.build(scaled_deg, window)

        # dB conversion + noise floor so the two are comparable.
        ref_spect_db, deg_spect_db = prepare_spectrograms_for_comparison(
            ref_spect, deg_spect
        )

        # Stage 2: feature selection and similarity measure.
        patch_indices = patch_creator.create_ref_patch_indices(
            ref_spect_db, ref_signal, window
        )

        frame_samples = int(window.size * window.overlap)
        frame_secs = calc_frame_duration(frame_samples, ref_signal.sample_rate)

        reference_patches = patch_creator.create_patches_from_indices(
            ref_spect_db, patch_indices
        )

        # DP patch matching against the degraded spectrogram.
        patch_results = find_most_optimal_deg_patches(
            reference_patches, patch_indices, deg_spect_db,
            frame_secs, search_window
        )

        # Optional fine realignment on the time-domain signals.
        if not disable_realignment:
            patch_results = finely_align_and_recreate_patches(
                patch_results, ref_signal, scaled_deg,
                spect_builder, window
            )

        # Aggregate the per-patch statistics into per-band features.
        band_means = calc_per_patch_mean_freq_band_means(patch_results)
        band_low_quantile = calc_per_patch_freq_band_quantile(patch_results, 0.10)
        band_stddevs = calc_per_patch_mean_freq_band_stddevs(
            patch_results, frame_secs
        )
        band_deg_energy = calc_per_patch_mean_freq_band_deg_energy(patch_results)

        # Map the features to a MOS prediction.
        predicted_mos = quality_mapper.predict_quality(
            band_means, band_low_quantile, band_stddevs, band_deg_energy
        )

        # vnsim is the mean of fvnsim over all bands.
        overall_similarity = float(np.mean(band_means))

        # Extremely low similarity overrides the prediction.
        final_mos = alter_for_similarity_extremes(overall_similarity, predicted_mos)

        return SimilarityResult(
            moslqo=final_mos,
            vnsim=overall_similarity,
            fvnsim=band_means,
            fvnsim10=band_low_quantile,
            fstdnsim=band_stddevs,
            fvdegenergy=band_deg_energy,
            center_freq_bands=ref_spect.center_freq_bands,
            patch_sims=patch_results,
        )
|
visqol/visqol_manager.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ViSQOL Manager: orchestrates the complete ViSQOL workflow.
|
|
3
|
+
|
|
4
|
+
Corresponds to C++ file: visqol_manager.cc
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from visqol.audio_utils import AudioSignal, load_as_mono
|
|
12
|
+
from visqol.analysis_window import AnalysisWindow
|
|
13
|
+
from visqol.alignment import globally_align
|
|
14
|
+
from visqol.gammatone import GammatoneSpectrogramBuilder
|
|
15
|
+
from visqol.patch_creator import ImagePatchCreator, VadPatchCreator
|
|
16
|
+
from visqol.quality_mapper import (
|
|
17
|
+
SvrSimilarityToQualityMapper,
|
|
18
|
+
SpeechSimilarityToQualityMapper,
|
|
19
|
+
)
|
|
20
|
+
from visqol.visqol_core import VisqolCore, SimilarityResult
|
|
21
|
+
|
|
22
|
+
# Module-level logger; VisqolManager._validate_input emits its warnings here.
logger = logging.getLogger(__name__)

# Default parameters (matching C++ VisqolManager constants)
# Patch sizes passed to the patch creators in VisqolManager.init
# (presumably spectrogram frames per patch -- TODO confirm in patch_creator).
PATCH_SIZE_AUDIO = 30  # given to ImagePatchCreator (audio mode)
PATCH_SIZE_SPEECH = 20  # given to VadPatchCreator (speech mode)
# Number of bands passed to GammatoneSpectrogramBuilder for each mode.
NUM_BANDS_AUDIO = 32
NUM_BANDS_SPEECH = 21
# Minimum frequency passed to GammatoneSpectrogramBuilder in both modes.
MINIMUM_FREQ = 50.0  # Hz (wideband)
# Overlap value passed to AnalysisWindow.
OVERLAP = 0.25  # 25% overlap
# A reference/degraded duration difference above this only logs a warning.
DURATION_MISMATCH_TOLERANCE = 1.0  # seconds

# Speech mode warns when the input sample rate is above this value.
K_16K_SAMPLE_RATE = 16000
# Audio mode warns when the input sample rate differs from this value.
K_48K_SAMPLE_RATE = 48000
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class VisqolManager:
    """
    Main manager class that configures and runs ViSQOL.

    Typical use: construct, call ``init`` once to pick the mode and build the
    pipeline components, then call ``run`` or ``run_from_signals``.
    """

    def __init__(self):
        # Configuration; replaced by the values passed to init().
        self.use_speech_mode = False
        self.search_window = 60
        self.disable_global_alignment = False
        self.disable_realignment = False
        self.is_initialized = False

        # Mode-specific components; created by init().
        self.spect_builder = None
        self.patch_creator = None
        self.quality_mapper = None
        self.visqol_core = VisqolCore()

    def init(self,
             model_path: str = "",
             use_speech_mode: bool = False,
             use_unscaled_speech: bool = False,
             search_window: int = 60,
             disable_global_alignment: bool = False,
             disable_realignment: bool = False):
        """
        Configure the manager and build the components for the chosen mode.

        Args:
            model_path: Path to SVR model file (Audio mode).
            use_speech_mode: Use speech mode (16kHz, 21 bands).
            use_unscaled_speech: Don't scale speech MOS to max 5.0.
            search_window: Search window radius in patch units.
            disable_global_alignment: Skip global alignment.
            disable_realignment: Skip fine realignment.
        """
        self.use_speech_mode = use_speech_mode
        self.search_window = search_window
        self.disable_global_alignment = disable_global_alignment
        self.disable_realignment = disable_realignment

        # Patch creator, spectrogram builder and quality mapper all depend on
        # the mode; they are built in that order.
        if use_speech_mode:
            self.patch_creator = VadPatchCreator(PATCH_SIZE_SPEECH)
            self.spect_builder = GammatoneSpectrogramBuilder(
                NUM_BANDS_SPEECH, MINIMUM_FREQ, speech_mode=True
            )
            self.quality_mapper = SpeechSimilarityToQualityMapper(
                scale_to_max_mos=not use_unscaled_speech
            )
        else:
            self.patch_creator = ImagePatchCreator(PATCH_SIZE_AUDIO)
            self.spect_builder = GammatoneSpectrogramBuilder(
                NUM_BANDS_AUDIO, MINIMUM_FREQ, speech_mode=False
            )
            self.quality_mapper = SvrSimilarityToQualityMapper(model_path)

        self.quality_mapper.init()
        self.is_initialized = True

    def _require_initialized(self):
        """Raise RuntimeError unless init() has completed."""
        if not self.is_initialized:
            raise RuntimeError("VisqolManager must be initialized before use.")

    def run(self, ref_path: str, deg_path: str) -> SimilarityResult:
        """
        Load two audio files as mono and compare them.

        Args:
            ref_path: Path to reference audio file.
            deg_path: Path to degraded audio file.

        Returns:
            SimilarityResult with MOS-LQO score.

        Raises:
            RuntimeError: If the manager has not been initialized.
        """
        # Checked before any file is read.
        self._require_initialized()

        reference = load_as_mono(ref_path)
        degraded = load_as_mono(deg_path)
        return self.run_from_signals(reference, degraded)

    def run_from_signals(self, ref_signal: AudioSignal,
                         deg_signal: AudioSignal) -> SimilarityResult:
        """
        Compare two already-loaded audio signals.

        Args:
            ref_signal: Reference audio signal.
            deg_signal: Degraded audio signal.

        Returns:
            SimilarityResult with MOS-LQO score.

        Raises:
            RuntimeError: If the manager has not been initialized.
            ValueError: If the two signals have different sample rates.
        """
        self._require_initialized()
        self._validate_input(ref_signal, deg_signal)

        # Global alignment replaces the degraded signal with its aligned
        # version; the reported lag is not used.
        if not self.disable_global_alignment:
            deg_signal, _lag = globally_align(ref_signal, deg_signal)

        analysis_window = AnalysisWindow(ref_signal.sample_rate, OVERLAP)

        return self.visqol_core.calculate_similarity(
            ref_signal=ref_signal,
            deg_signal=deg_signal,
            spect_builder=self.spect_builder,
            window=analysis_window,
            patch_creator=self.patch_creator,
            search_window=self.search_window,
            quality_mapper=self.quality_mapper,
            disable_realignment=self.disable_realignment,
        )

    def _validate_input(self, ref_signal: AudioSignal,
                        deg_signal: AudioSignal):
        """
        Check the two input signals against each other and against the mode.

        A duration mismatch and an unexpected sample rate for the mode are
        only logged as warnings; differing sample rates are an error.

        Raises:
            ValueError: If the signals have different sample rates.
        """
        ref_dur = ref_signal.duration
        deg_dur = deg_signal.duration
        if abs(ref_dur - deg_dur) > DURATION_MISMATCH_TOLERANCE:
            logger.warning(
                "Duration mismatch: reference=%.2fs, degraded=%.2fs",
                ref_dur, deg_dur
            )

        if ref_signal.sample_rate != deg_signal.sample_rate:
            raise ValueError(
                f"Sample rate mismatch: reference={ref_signal.sample_rate}, "
                f"degraded={deg_signal.sample_rate}"
            )

        # Mode-specific sample-rate advice (warnings only).
        if self.use_speech_mode:
            if ref_signal.sample_rate > K_16K_SAMPLE_RATE:
                logger.warning(
                    "Input sample rate (%d) is above 16kHz. "
                    "Consider resampling for speech mode.",
                    ref_signal.sample_rate
                )
        elif ref_signal.sample_rate != K_48K_SAMPLE_RATE:
            logger.warning(
                "Input sample rate (%d) is not 48kHz. "
                "This may affect audio mode scoring.",
                ref_signal.sample_rate
            )
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: visqol-python
|
|
3
|
+
Version: 3.3.3
|
|
4
|
+
Summary: ViSQOL - Virtual Speech Quality Objective Listener (Pure Python)
|
|
5
|
+
Home-page: https://github.com/talker93/visqol-python
|
|
6
|
+
Author: Shan Jiang
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Project-URL: Homepage, https://github.com/talker93/visqol-python
|
|
9
|
+
Project-URL: Bug Reports, https://github.com/talker93/visqol-python/issues
|
|
10
|
+
Project-URL: Source, https://github.com/talker93/visqol-python
|
|
11
|
+
Project-URL: Original C++, https://github.com/google/visqol
|
|
12
|
+
Keywords: audio-quality,speech-quality,MOS,PESQ,POLQA,visqol,objective-metric,perceptual-quality
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Analysis
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Dist: numpy>=1.20
|
|
29
|
+
Requires-Dist: scipy>=1.7
|
|
30
|
+
Requires-Dist: soundfile>=0.10
|
|
31
|
+
Requires-Dist: libsvm-official>=3.25
|
|
32
|
+
Dynamic: home-page
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
Dynamic: requires-python
|
|
35
|
+
|
|
36
|
+
# ViSQOL (Python)
|
|
37
|
+
|
|
38
|
+
A pure Python implementation of [Google's ViSQOL](https://github.com/google/visqol) (Virtual Speech Quality Objective Listener) v3.3.3 for objective audio/speech quality assessment.
|
|
39
|
+
|
|
40
|
+
ViSQOL compares a reference audio signal with a degraded version and outputs a **MOS-LQO** (Mean Opinion Score - Listening Quality Objective) score on a scale of **1.0 – 5.0**.
|
|
41
|
+
|
|
42
|
+
## Features
|
|
43
|
+
|
|
44
|
+
- **Two modes**: Audio mode (music/general audio at 48 kHz) and Speech mode (speech at 16 kHz)
|
|
45
|
+
- **High accuracy**: 11/11 conformance tests pass against the official C++ implementation
|
|
46
|
+
- Audio mode: 9/10 tests produce **identical** MOS scores (diff = 0.000000), 1 test diff = 0.000117
|
|
47
|
+
- Speech mode: diff = 0.006715
|
|
48
|
+
- **Pure Python**: no C/C++ compilation required
|
|
49
|
+
- **Minimal dependencies**: only 4 pip packages (`numpy`, `scipy`, `soundfile`, `libsvm-official`)
|
|
50
|
+
- **Faster than real-time**: Audio RTF ≈ 0.71x, Speech RTF ≈ 0.38x
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install numpy scipy soundfile libsvm-official
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Or install as a package:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
git clone https://github.com/talker93/visqol-python.git
|
|
62
|
+
cd visqol-python
|
|
63
|
+
pip install -e .
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
### Python API
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from visqol import VisqolApi
|
|
72
|
+
|
|
73
|
+
# Audio mode (default) - for music and general audio
|
|
74
|
+
api = VisqolApi()
|
|
75
|
+
api.create(mode="audio")
|
|
76
|
+
result = api.measure("reference.wav", "degraded.wav")
|
|
77
|
+
print(f"MOS-LQO: {result.moslqo:.4f}")
|
|
78
|
+
|
|
79
|
+
# Speech mode - for speech signals
|
|
80
|
+
api = VisqolApi()
|
|
81
|
+
api.create(mode="speech")
|
|
82
|
+
result = api.measure("ref_speech.wav", "deg_speech.wav")
|
|
83
|
+
print(f"MOS-LQO: {result.moslqo:.4f}")
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Using NumPy Arrays
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
import numpy as np
|
|
90
|
+
import soundfile as sf
|
|
91
|
+
from visqol import VisqolApi
|
|
92
|
+
|
|
93
|
+
ref, sr = sf.read("reference.wav")
|
|
94
|
+
deg, _ = sf.read("degraded.wav")
|
|
95
|
+
|
|
96
|
+
api = VisqolApi()
|
|
97
|
+
api.create(mode="audio")
|
|
98
|
+
result = api.measure_from_arrays(ref, deg, sample_rate=sr)
|
|
99
|
+
print(f"MOS-LQO: {result.moslqo:.4f}")
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Command Line
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# Audio mode (default)
|
|
106
|
+
python -m visqol -r reference.wav -d degraded.wav
|
|
107
|
+
|
|
108
|
+
# Speech mode
|
|
109
|
+
python -m visqol -r reference.wav -d degraded.wav --speech_mode
|
|
110
|
+
|
|
111
|
+
# Verbose output (per-patch details)
|
|
112
|
+
python -m visqol -r reference.wav -d degraded.wav -v
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
**CLI options:**
|
|
116
|
+
|
|
117
|
+
| Flag | Description |
|
|
118
|
+
|------|-------------|
|
|
119
|
+
| `-r`, `--reference` | Path to reference WAV file (required) |
|
|
120
|
+
| `-d`, `--degraded` | Path to degraded WAV file (required) |
|
|
121
|
+
| `--speech_mode` | Use speech mode (16 kHz, polynomial mapping) |
|
|
122
|
+
| `--model` | Custom SVR model file path (audio mode only) |
|
|
123
|
+
| `--search_window` | Search window radius (default: 60) |
|
|
124
|
+
| `--verbose`, `-v` | Show detailed per-patch results |
|
|
125
|
+
|
|
126
|
+
## Output
|
|
127
|
+
|
|
128
|
+
The `measure()` method returns a `SimilarityResult` object with:
|
|
129
|
+
|
|
130
|
+
| Field | Description |
|
|
131
|
+
|-------|-------------|
|
|
132
|
+
| `moslqo` | MOS-LQO score (1.0 – 5.0) |
|
|
133
|
+
| `vnsim` | Mean NSIM across all patches and frequency bands |
|
|
134
|
+
| `fvnsim` | Per-frequency-band mean NSIM |
|
|
135
|
+
| `fstdnsim` | Per-frequency-band std of NSIM |
|
|
136
|
+
| `fvdegenergy` | Per-frequency-band degraded energy |
|
|
137
|
+
| `patch_sims` | List of per-patch similarity details |
|
|
138
|
+
|
|
139
|
+
## Modes
|
|
140
|
+
|
|
141
|
+
### Audio Mode (default)
|
|
142
|
+
- Target sample rate: **48 kHz**
|
|
143
|
+
- 32 Gammatone frequency bands (50 Hz – 15 000 Hz)
|
|
144
|
+
- Quality mapping: SVR (Support Vector Regression) model
|
|
145
|
+
- Best for: music, environmental audio, codecs
|
|
146
|
+
|
|
147
|
+
### Speech Mode
|
|
148
|
+
- Target sample rate: **16 kHz**
|
|
149
|
+
- 21 Gammatone frequency bands (50 Hz – 8 000 Hz)
|
|
150
|
+
- Quality mapping: exponential polynomial fit
|
|
151
|
+
- VAD (Voice Activity Detection) based patch selection
|
|
152
|
+
- Best for: speech, VoIP, telephony
|
|
153
|
+
|
|
154
|
+
## Performance
|
|
155
|
+
|
|
156
|
+
Measured on Apple M-series, Python 3.13:
|
|
157
|
+
|
|
158
|
+
| Mode | Avg RTF | Typical Time |
|
|
159
|
+
|------|---------|-------------|
|
|
160
|
+
| Audio (48 kHz) | **0.71x** | 7 – 12 s per file pair |
|
|
161
|
+
| Speech (16 kHz) | **0.38x** | ~1 s per file pair |
|
|
162
|
+
|
|
163
|
+
> RTF (Real-Time Factor) < 1.0 means faster than real-time.
|
|
164
|
+
|
|
165
|
+
## Project Structure
|
|
166
|
+
|
|
167
|
+
```
|
|
168
|
+
visqol-python/
|
|
169
|
+
├── visqol/ # Main package
|
|
170
|
+
│ ├── __init__.py # Package exports
|
|
171
|
+
│ ├── api.py # Public API
|
|
172
|
+
│ ├── visqol_manager.py # Pipeline orchestrator
|
|
173
|
+
│ ├── visqol_core.py # Core algorithm
|
|
174
|
+
│ ├── audio_utils.py # Audio I/O & SPL normalization
|
|
175
|
+
│ ├── signal_utils.py # Envelope, cross-correlation
|
|
176
|
+
│ ├── analysis_window.py # Hann window
|
|
177
|
+
│ ├── gammatone.py # ERB + Gammatone filterbank + spectrogram
|
|
178
|
+
│ ├── patch_creator.py # Patch creation (Image + VAD modes)
|
|
179
|
+
│ ├── patch_selector.py # DP-based optimal patch matching
|
|
180
|
+
│ ├── alignment.py # Global alignment via cross-correlation
|
|
181
|
+
│ ├── nsim.py # NSIM similarity metric
|
|
182
|
+
│ ├── quality_mapper.py # SVR & exponential quality mapping
|
|
183
|
+
│ └── __main__.py # CLI entry point
|
|
184
|
+
├── model/ # Bundled SVR model
|
|
185
|
+
│ └── libsvm_nu_svr_model.txt
|
|
186
|
+
├── tests/ # Conformance tests
|
|
187
|
+
│ ├── test_conformance.py
|
|
188
|
+
│ └── test_quick.py
|
|
189
|
+
├── setup.py
|
|
190
|
+
├── requirements.txt
|
|
191
|
+
├── LICENSE
|
|
192
|
+
└── README.md
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Conformance Test Results
|
|
196
|
+
|
|
197
|
+
Tested against the [official C++ ViSQOL v3.3.3](https://github.com/google/visqol) expected values:
|
|
198
|
+
|
|
199
|
+
| Test Case | Mode | Expected MOS | Python MOS | Δ |
|
|
200
|
+
|-----------|------|-------------|------------|---|
|
|
201
|
+
| strauss_lp35 | Audio | 1.3889 | 1.3889 | 0.000000 |
|
|
202
|
+
| steely_lp7 | Audio | 2.2502 | 2.2502 | 0.000000 |
|
|
203
|
+
| sopr_256aac | Audio | 4.6823 | 4.6823 | 0.000000 |
|
|
204
|
+
| ravel_128opus | Audio | 4.4651 | 4.4651 | 0.000000 |
|
|
205
|
+
| moonlight_128aac | Audio | 4.6843 | 4.6843 | 0.000000 |
|
|
206
|
+
| harpsichord_96mp3 | Audio | 4.2237 | 4.2237 | 0.000000 |
|
|
207
|
+
| guitar_64aac | Audio | 4.3497 | 4.3497 | 0.000000 |
|
|
208
|
+
| glock_48aac | Audio | 4.3325 | 4.3325 | 0.000000 |
|
|
209
|
+
| contrabassoon_24aac | Audio | 2.3469 | 2.3468 | 0.000117 |
|
|
210
|
+
| castanets_identity | Audio | 4.7321 | 4.7321 | 0.000000 |
|
|
211
|
+
| speech_CA01 | Speech | 3.3745 | 3.3678 | 0.006715 |
|
|
212
|
+
|
|
213
|
+
## References
|
|
214
|
+
|
|
215
|
+
- [Google ViSQOL (C++)](https://github.com/google/visqol) — the original implementation this project is ported from
|
|
216
|
+
- Hines, A., Gillen, E., Kelly, D., Skoglund, J., Kokaram, A., & Harte, N. (2015). *ViSQOLAudio: An Objective Audio Quality Metric for Low Bitrate Codecs.* The Journal of the Acoustical Society of America.
|
|
217
|
+
- Chinen, M., Lim, F. S., Skoglund, J., Gureev, N., O'Gorman, F., & Hines, A. (2020). *ViSQOL v3: An Open Source Production Ready Objective Speech and Audio Metric.* 2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX).
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
Apache License 2.0. See [LICENSE](LICENSE) for details.
|
|
222
|
+
|
|
223
|
+
This project is a Python port of [Google's ViSQOL](https://github.com/google/visqol), which is also licensed under Apache 2.0.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
visqol/__init__.py,sha256=0yqtDr50SFgAI67HA6i83B97YPvMEsjhQcYbimV6YfM,539
|
|
2
|
+
visqol/__main__.py,sha256=z3vXtZBKrQEnoUmvxxws58swBQuTY7CBHi5unHSSCCE,2776
|
|
3
|
+
visqol/alignment.py,sha256=I78OZzQ4N-zlr60WYQlYNVPDe-hFYzWfXIZGw2yfx0s,2664
|
|
4
|
+
visqol/analysis_window.py,sha256=j2P9ppg1PVDY5hY7c_mlLlbWn5-zePtsKO5NmQPHYuk,1659
|
|
5
|
+
visqol/api.py,sha256=pLqpThyr0ndq-e3UARv8FBgPJ7kN_u8FAafpu5i_078,3828
|
|
6
|
+
visqol/audio_utils.py,sha256=6iNrVJjKpxj7xd2LzbbLbvg0jdaSHuyhO4BJ7qR5kUY,2479
|
|
7
|
+
visqol/gammatone.py,sha256=fuM0zdmW5DLPxBa-iF_RamlQpPt6yClquSND6PEuBoA,13228
|
|
8
|
+
visqol/nsim.py,sha256=64qmZ6dHbSw1y_COJIcRK2YNcfJA4wNDopHcVLkEKHk,4789
|
|
9
|
+
visqol/patch_creator.py,sha256=bu6bKmBtUWn6Co9XiHsnnfQPjNmwaFhkMRB46ZlWahI,7591
|
|
10
|
+
visqol/patch_selector.py,sha256=p4YsV0qEaa54Eif3ENoUhtlmyEIOb5FixEdpT-rWXtw,12813
|
|
11
|
+
visqol/quality_mapper.py,sha256=0qgdIzwnwIqhsTScz2_6EW9hFZpUa_0-IeQnLqnhw30,3831
|
|
12
|
+
visqol/signal_utils.py,sha256=q3QfLb2Cy2uTYXMy9skgiASYKJe5klTrDDg8rWMalsI,2495
|
|
13
|
+
visqol/visqol_core.py,sha256=I9Qp2fzVe6n203t3HHPIuXThIM2PVXg5XfrQUH74PZE,8034
|
|
14
|
+
visqol/visqol_manager.py,sha256=roUk9hD5o613lUQ0kaxxQ3fwU4JOqqNz6OmfpN5vHfM,6527
|
|
15
|
+
visqol/model/libsvm_nu_svr_model.txt,sha256=HoJG7TO_NtxchZNR9xEPLNMfmGYZiXFcD8-XTsSNPi4,138117
|
|
16
|
+
visqol_python-3.3.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
17
|
+
visqol_python-3.3.3.dist-info/METADATA,sha256=V90XyFp1SnRj28t3BhYOVtXPe3IE7lnrzxKwGspcuvU,8217
|
|
18
|
+
visqol_python-3.3.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
19
|
+
visqol_python-3.3.3.dist-info/entry_points.txt,sha256=Dl7miY_7U-116DDs7JJsrD_XhQZf5r9nfdm4N3ocpBQ,48
|
|
20
|
+
visqol_python-3.3.3.dist-info/top_level.txt,sha256=JAzbSsJqgc6Ol-wc6r_xHonHsWVM-200RFAwuSMp0EY,7
|
|
21
|
+
visqol_python-3.3.3.dist-info/RECORD,,
|