visqol-python 3.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- visqol/__init__.py +20 -0
- visqol/__main__.py +92 -0
- visqol/alignment.py +82 -0
- visqol/analysis_window.py +52 -0
- visqol/api.py +110 -0
- visqol/audio_utils.py +90 -0
- visqol/gammatone.py +418 -0
- visqol/model/libsvm_nu_svr_model.txt +324 -0
- visqol/nsim.py +134 -0
- visqol/patch_creator.py +222 -0
- visqol/patch_selector.py +357 -0
- visqol/quality_mapper.py +114 -0
- visqol/signal_utils.py +83 -0
- visqol/visqol_core.py +240 -0
- visqol/visqol_manager.py +194 -0
- visqol_python-3.3.3.dist-info/METADATA +223 -0
- visqol_python-3.3.3.dist-info/RECORD +21 -0
- visqol_python-3.3.3.dist-info/WHEEL +5 -0
- visqol_python-3.3.3.dist-info/entry_points.txt +2 -0
- visqol_python-3.3.3.dist-info/licenses/LICENSE +201 -0
- visqol_python-3.3.3.dist-info/top_level.txt +1 -0
visqol/nsim.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Neurogram Similarity Index Measure (NSIM).
|
|
3
|
+
|
|
4
|
+
A variant of SSIM adapted for neurogram (spectrogram) comparison.
|
|
5
|
+
|
|
6
|
+
Corresponds to C++ files:
|
|
7
|
+
- neurogram_similiarity_index_measure.cc
|
|
8
|
+
- convolution_2d.cc
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# 3x3 Gaussian smoothing window (weights hardcoded from the C++ code).
# The window is symmetric, so only three distinct weights occur.
_WINDOW_CORNER = 0.0113033910173052
_WINDOW_EDGE = 0.0838251475442633
_WINDOW_CENTER = 0.619485845753726

GAUSSIAN_WINDOW = np.array([
    [_WINDOW_CORNER, _WINDOW_EDGE, _WINDOW_CORNER],
    [_WINDOW_EDGE, _WINDOW_CENTER, _WINDOW_EDGE],
    [_WINDOW_CORNER, _WINDOW_EDGE, _WINDOW_CORNER],
])

# Constants for the NSIM calculation.
# C1 stabilises the intensity term and C3 the structure term, so that
# neither division blows up when the local statistics are close to zero.
INTENSITY_RANGE = 1.0
K1 = 0.01
K2 = 0.03
C1 = (K1 * INTENSITY_RANGE) ** 2  # = 0.0001
C3 = (K2 * INTENSITY_RANGE) ** 2 / 2.0  # = 0.00045
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class PatchSimilarityResult:
    """Result of comparing a reference patch with a degraded patch.

    Instances support ``==`` / ``!=``. The comparison is array-aware: the
    equality that ``@dataclass`` would generate compares the fields as a
    tuple, which fails on NumPy array fields because the truth value of an
    element-wise comparison is ambiguous.
    """
    # Overall similarity score of the patch pair.
    similarity: float = 0.0
    # Per-frequency-band statistics; each default is a fresh empty array.
    freq_band_means: np.ndarray = field(default_factory=lambda: np.array([]))
    freq_band_stddevs: np.ndarray = field(default_factory=lambda: np.array([]))
    freq_band_deg_energy: np.ndarray = field(default_factory=lambda: np.array([]))

    # Timing info
    ref_patch_start_time: float = 0.0
    ref_patch_end_time: float = 0.0
    deg_patch_start_time: float = 0.0
    deg_patch_end_time: float = 0.0

    def __eq__(self, other):
        """Return True when every field matches; arrays are compared by value.

        Array fields are compared with ``np.array_equal`` (same shape and
        same elements). Objects of a different class are never equal.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        scalars_equal = (
            self.similarity == other.similarity
            and self.ref_patch_start_time == other.ref_patch_start_time
            and self.ref_patch_end_time == other.ref_patch_end_time
            and self.deg_patch_start_time == other.deg_patch_start_time
            and self.deg_patch_end_time == other.deg_patch_end_time
        )
        if not scalars_equal:
            return False
        array_fields = (
            "freq_band_means",
            "freq_band_stddevs",
            "freq_band_deg_energy",
        )
        return all(
            np.array_equal(getattr(self, name), getattr(other, name))
            for name in array_fields
        )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _valid_2d_conv_with_boundary(kernel: np.ndarray,
|
|
46
|
+
matrix: np.ndarray) -> np.ndarray:
|
|
47
|
+
"""
|
|
48
|
+
2D convolution with boundary replication padding, then 'valid' convolution.
|
|
49
|
+
|
|
50
|
+
Matches C++ Convolution2D::Valid2DConvWithBoundary which:
|
|
51
|
+
1. Pads matrix by 1 on each side with edge replication
|
|
52
|
+
2. Performs valid convolution with REVERSED kernel
|
|
53
|
+
|
|
54
|
+
The C++ code reverses the kernel in column-major order during convolution.
|
|
55
|
+
Since our Gaussian kernel is symmetric (both row-symmetric and column-symmetric),
|
|
56
|
+
the reversal has no effect. We can use scipy's fast convolution directly.
|
|
57
|
+
|
|
58
|
+
The output has the same shape as the input matrix (edge-padded then valid conv).
|
|
59
|
+
"""
|
|
60
|
+
from scipy.ndimage import convolve
|
|
61
|
+
|
|
62
|
+
# Pad matrix by 1 on each side with edge replication
|
|
63
|
+
padded = np.pad(matrix, pad_width=1, mode='edge')
|
|
64
|
+
|
|
65
|
+
# The C++ reverses the kernel in column-major layout.
|
|
66
|
+
# For the symmetric Gaussian kernel, this is equivalent to no reversal.
|
|
67
|
+
# Use scipy.ndimage.convolve which does correlation (no kernel flip)
|
|
68
|
+
# on the padded matrix, then take the valid region.
|
|
69
|
+
|
|
70
|
+
# scipy.ndimage.convolve handles the full convolution;
|
|
71
|
+
# we use mode='constant' with cval=0 since we already padded.
|
|
72
|
+
# But more efficiently: just use 'valid' equivalent by slicing.
|
|
73
|
+
from scipy.signal import correlate2d
|
|
74
|
+
result = correlate2d(padded, kernel, mode='valid')
|
|
75
|
+
return result
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def measure_patch_similarity(ref_patch: np.ndarray,
                             deg_patch: np.ndarray) -> PatchSimilarityResult:
    """
    Compute NSIM similarity between a reference and degraded patch.

    Matches C++ NeurogramSimiliarityIndexMeasure::MeasurePatchSimilarity.

    The similarity map is the element-wise product of an intensity term
    (built from the local means) and a structure term (built from the local
    variances and covariance). Local statistics are obtained by filtering
    with GAUSSIAN_WINDOW.

    Args:
        ref_patch: (num_bands, num_frames) reference spectrogram patch.
        deg_patch: (num_bands, num_frames) degraded spectrogram patch.

    Returns:
        PatchSimilarityResult with the overall similarity score (mean of the
        per-band means), the per-band mean and standard deviation of the
        similarity map, and the per-band mean of the degraded patch. The
        timing fields keep their defaults.
    """
    w = GAUSSIAN_WINDOW

    # Local means
    mu_r = _valid_2d_conv_with_boundary(w, ref_patch)
    mu_d = _valid_2d_conv_with_boundary(w, deg_patch)

    # Squared means
    ref_mu_sq = mu_r * mu_r
    deg_mu_sq = mu_d * mu_d
    mu_r_mu_d = mu_r * mu_d

    # Local variances and covariance: E[x*y] - E[x]*E[y]
    sigma_r_sq = _valid_2d_conv_with_boundary(w, ref_patch * ref_patch) - ref_mu_sq
    sigma_d_sq = _valid_2d_conv_with_boundary(w, deg_patch * deg_patch) - deg_mu_sq
    sigma_r_d = _valid_2d_conv_with_boundary(w, ref_patch * deg_patch) - mu_r_mu_d

    # Intensity (luminance) component
    intensity = (2.0 * mu_r_mu_d + C1) / (ref_mu_sq + deg_mu_sq + C1)

    # Structure component
    structure_numer = sigma_r_d + C3
    var_product = sigma_r_sq * sigma_d_sq
    # The variance product can come out negative (e.g. with silent patches);
    # such entries must yield a denominator of exactly C3. Clamping at zero
    # before the square root gives sqrt(0) + C3 == C3 for them without ever
    # evaluating sqrt on a negative number, so no RuntimeWarning is raised.
    structure_denom = np.sqrt(np.maximum(var_product, 0.0)) + C3
    structure = structure_numer / structure_denom

    # Combined similarity map
    sim_map = intensity * structure

    # Per-frequency-band statistics (one value per row of the map)
    freq_band_means = np.mean(sim_map, axis=1)
    freq_band_stddevs = np.std(sim_map, axis=1, ddof=0)
    freq_band_deg_energy = np.mean(deg_patch, axis=1)

    # Overall similarity (mean of frequency band means)
    mean_freq_band_means = np.mean(freq_band_means)

    return PatchSimilarityResult(
        similarity=float(mean_freq_band_means),
        freq_band_means=freq_band_means,
        freq_band_stddevs=freq_band_stddevs,
        freq_band_deg_energy=freq_band_deg_energy,
    )
|
visqol/patch_creator.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Patch creation for reference spectrograms.
|
|
3
|
+
- ImagePatchCreator: evenly-spaced patches (Audio mode)
|
|
4
|
+
- VadPatchCreator: VAD-filtered patches (Speech mode)
|
|
5
|
+
|
|
6
|
+
Corresponds to C++ files:
|
|
7
|
+
- image_patch_creator.cc
|
|
8
|
+
- vad_patch_creator.cc
|
|
9
|
+
- rms_vad.cc/h
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from visqol.analysis_window import AnalysisWindow
|
|
14
|
+
from visqol.audio_utils import AudioSignal
|
|
15
|
+
from visqol.signal_utils import normalize
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ============ RMS VAD ============
|
|
19
|
+
|
|
20
|
+
class RmsVad:
    """
    Simple RMS-based Voice Activity Detection.
    Matches C++ RmsVad class.

    Feed fixed-size chunks through ``process_chunk`` and then read the
    per-chunk decisions with ``get_vad_results``. A chunk is reported as
    silent only when it and the preceding (SILENT_CHUNK_COUNT - 1) chunks
    are all below the RMS threshold.
    """
    VOICE_ACTIVITY_PRESENT = 1.0
    VOICE_ACTIVITY_ABSENT = 0.0
    # Number of consecutive quiet chunks needed to report silence.
    SILENT_CHUNK_COUNT = 3
    # RMS level below which a single chunk counts as quiet.
    RMS_THRESHOLD = 5000.0

    def __init__(self):
        # Raw per-chunk decisions, one entry per process_chunk() call.
        self.each_chunk_result = []
        # Initialize first (kSilentChunkCount - 1) results as voice-active
        # to avoid false negatives
        self.vad_results = [self.VOICE_ACTIVITY_PRESENT] * (self.SILENT_CHUNK_COUNT - 1)

    def process_chunk(self, chunk: np.ndarray) -> float:
        """Process a chunk of int16 samples.

        Records whether the chunk's RMS reaches RMS_THRESHOLD and returns
        the RMS value.
        """
        # Promote to float64 before squaring so int16 values cannot overflow.
        rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
        if rms < self.RMS_THRESHOLD:
            self.each_chunk_result.append(self.VOICE_ACTIVITY_ABSENT)
        else:
            self.each_chunk_result.append(self.VOICE_ACTIVITY_PRESENT)
        return rms

    def get_vad_results(self) -> list:
        """Get VAD results for all processed chunks.

        The list always starts with (SILENT_CHUNK_COUNT - 1) voice-active
        entries, followed by one entry per later chunk.

        The result is rebuilt from scratch on every call, so calling this
        method repeatedly returns the same list instead of appending the
        same decisions again. ``self.vad_results`` is updated to the
        returned list.
        """
        lead_in = self.SILENT_CHUNK_COUNT - 1
        results = [self.VOICE_ACTIVITY_PRESENT] * lead_in
        for i in range(lead_in, len(self.each_chunk_result)):
            is_silent = (
                self.each_chunk_result[i] == self.VOICE_ACTIVITY_ABSENT
                and self._check_previous_chunks_for_silence(i)
            )
            results.append(
                self.VOICE_ACTIVITY_ABSENT if is_silent
                else self.VOICE_ACTIVITY_PRESENT
            )
        self.vad_results = results
        return results

    def _check_previous_chunks_for_silence(self, idx: int) -> bool:
        """Check if the (SILENT_CHUNK_COUNT - 1) chunks before idx are silent."""
        return all(
            self.each_chunk_result[idx - j] != self.VOICE_ACTIVITY_PRESENT
            for j in range(1, self.SILENT_CHUNK_COUNT)
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ============ Image Patch Creator (Audio mode) ============
|
|
64
|
+
|
|
65
|
+
class ImagePatchCreator:
    """
    Creates evenly-spaced patches from a spectrogram.
    Used for Audio mode.
    """

    def __init__(self, patch_size: int):
        # Width of every patch, in spectrogram columns (frames).
        self.patch_size = patch_size

    def create_ref_patch_indices(self, spectrogram: np.ndarray,
                                 ref_signal: "AudioSignal" = None,
                                 window: "AnalysisWindow" = None) -> list:
        """
        Create reference patch indices at evenly-spaced intervals.

        Matches C++ ImagePatchCreator::CreateRefPatchIndices.

        Args:
            spectrogram: (num_bands, num_frames) spectrogram matrix.
            ref_signal: Not used by this creator.
            window: Not used by this creator.

        Returns:
            List of start column indices for patches.

        Raises:
            ValueError: If the patch size is not positive, or the spectrogram
                has fewer columns than one and a half patches.
        """
        # A non-positive step would make the index walk below never finish.
        if self.patch_size < 1:
            raise ValueError(
                f"Patch size must be a positive integer, got {self.patch_size}."
            )

        spectrum_length = spectrogram.shape[1]
        init_patch_index = self.patch_size // 2

        if spectrum_length < self.patch_size + init_patch_index:
            raise ValueError(
                f"Reference spectrum size ({spectrum_length}) smaller than "
                f"minimum patch size ({self.patch_size + init_patch_index})."
            )

        # C++: max_index logic. When the spectrogram is exactly the minimum
        # length, the bound is chosen so that a single patch is produced.
        last_start = spectrum_length - self.patch_size
        if init_patch_index < last_start:
            max_index = last_start
        else:
            max_index = init_patch_index + 1

        # C++ uses 0-based, pushes i-1
        return [i - 1
                for i in range(init_patch_index, max_index, self.patch_size)]

    def create_patches_from_indices(self, spectrogram: np.ndarray,
                                    patch_indices: list) -> list:
        """
        Extract patches from spectrogram at given start indices.

        Args:
            spectrogram: (num_bands, num_frames) matrix.
            patch_indices: List of start column indices.

        Returns:
            List of (num_bands, patch_size) patch matrices, produced by
            slicing ``spectrogram`` (a patch that runs past the last column
            is narrower).
        """
        return [spectrogram[:, start_col:start_col + self.patch_size]
                for start_col in patch_indices]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ============ VAD Patch Creator (Speech mode) ============
|
|
132
|
+
|
|
133
|
+
class VadPatchCreator:
    """
    Creates VAD-filtered patches from a spectrogram.
    Used for Speech mode.
    """

    # Minimum summed voice-activity score a patch needs in order to be kept.
    FRAMES_WITH_VA_THRESHOLD = 1.0

    def __init__(self, patch_size: int):
        # Width of every patch, in spectrogram columns (frames).
        self.patch_size = patch_size

    def _get_voice_activity(self, signal: "AudioSignal", start_sample: int,
                            total_samples: int, frame_len: int) -> list:
        """
        Get voice activity detection results for a signal segment.

        Matches C++ VadPatchCreator::GetVoiceActivity.

        Args:
            signal: Signal whose ``data`` holds the samples.
            start_sample: Index of the first sample to analyse.
            total_samples: Number of samples to analyse.
            frame_len: Number of samples per VAD chunk.

        Returns:
            The list produced by ``RmsVad.get_vad_results`` after feeding it
            every complete chunk of the segment. Trailing samples that do
            not fill a whole chunk are ignored.
        """
        rms_vad = RmsVad()
        segment = signal.data[start_sample:start_sample + total_samples]

        if frame_len > 0:
            # Quantize to int16 range in one vectorized pass: scale by 2**15,
            # clamp to [-32768, 32767], then truncate towards zero.
            scaled = np.asarray(segment, dtype=np.float64).reshape(-1) * (1 << 15)
            quantized = np.clip(
                scaled, -(1 << 15), (1 << 15) - 1
            ).astype(np.int16)

            # Only complete chunks are handed to the VAD.
            usable = (quantized.size // frame_len) * frame_len
            for offset in range(0, usable, frame_len):
                rms_vad.process_chunk(quantized[offset:offset + frame_len])

        return rms_vad.get_vad_results()

    def create_ref_patch_indices(self, spectrogram: np.ndarray,
                                 ref_signal: "AudioSignal",
                                 window: "AnalysisWindow") -> list:
        """
        Create VAD-filtered reference patch indices.

        Matches C++ VadPatchCreator::CreateRefPatchIndices.

        Args:
            spectrogram: Spectrogram matrix; its column count sets how many
                candidate patches there are.
            ref_signal: Reference signal; its data is normalized before VAD.
            window: Analysis window; ``size * overlap`` gives the VAD chunk
                length in samples.

        Returns:
            Start column indices of the patches whose summed voice-activity
            score reaches FRAMES_WITH_VA_THRESHOLD.
        """
        # Normalize signal
        norm_signal = AudioSignal(normalize(ref_signal.data),
                                  ref_signal.sample_rate)

        frame_size = int(window.size * window.overlap)
        patch_sample_len = self.patch_size * frame_size
        spectrum_length = spectrogram.shape[1]
        first_patch_idx = self.patch_size // 2 - 1
        patch_count = (spectrum_length - first_patch_idx) // self.patch_size
        total_sample_count = patch_count * patch_sample_len

        # Get VAD results
        # NOTE(review): first_patch_idx is a spectrogram column index but is
        # passed as a sample offset here; kept unchanged to preserve the
        # existing behaviour - confirm against the C++ implementation.
        vad_res = self._get_voice_activity(
            norm_signal, first_patch_idx, total_sample_count, frame_size
        )

        # Filter patches based on VAD
        ref_patch_indices = []
        for i in range(patch_count):
            # Slice VAD data for this patch; slicing clamps at the end of
            # the list, so a short final window needs no special case.
            start = i * self.patch_size
            frames_with_va = sum(vad_res[start:start + self.patch_size])

            if frames_with_va >= self.FRAMES_WITH_VA_THRESHOLD:
                ref_patch_indices.append(first_patch_idx + start)

        return ref_patch_indices

    def create_patches_from_indices(self, spectrogram: np.ndarray,
                                    patch_indices: list) -> list:
        """Extract patches from spectrogram at given indices.

        Each patch is the column slice of ``spectrogram`` that starts at the
        given index and spans ``patch_size`` columns.
        """
        return [spectrogram[:, start_col:start_col + self.patch_size]
                for start_col in patch_indices]
|