visqol-python 3.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
visqol/nsim.py ADDED
@@ -0,0 +1,134 @@
1
+ """
2
+ Neurogram Similarity Index Measure (NSIM).
3
+
4
+ A variant of SSIM adapted for neurogram (spectrogram) comparison.
5
+
6
+ Corresponds to C++ files:
7
+ - neurogram_similiarity_index_measure.cc
8
+ - convolution_2d.cc
9
+ """
10
+
11
+ import numpy as np
12
+ from dataclasses import dataclass, field
13
+
14
+
15
# 3x3 Gaussian window weights (hardcoded from C++).
# The window is symmetric, so it is assembled here from its three distinct
# weights: the corners, the edge midpoints and the centre.
_WINDOW_CORNER = 0.0113033910173052
_WINDOW_EDGE = 0.0838251475442633
_WINDOW_CENTRE = 0.619485845753726

GAUSSIAN_WINDOW = np.array([
    [_WINDOW_CORNER, _WINDOW_EDGE, _WINDOW_CORNER],
    [_WINDOW_EDGE, _WINDOW_CENTRE, _WINDOW_EDGE],
    [_WINDOW_CORNER, _WINDOW_EDGE, _WINDOW_CORNER],
])

# Constants for the NSIM calculation.
# C1 and C3 are the small offsets added to the numerators and denominators
# of the intensity and structure terms in measure_patch_similarity.
INTENSITY_RANGE = 1.0
K1 = 0.01
K2 = 0.03
C1 = (K1 * INTENSITY_RANGE) ** 2  # approx. 0.0001
C3 = (K2 * INTENSITY_RANGE) ** 2 / 2.0  # approx. 0.00045
28
+
29
+
30
def _empty_band_array() -> np.ndarray:
    """Return a fresh zero-length float array (default for per-band fields)."""
    return np.array([])


@dataclass
class PatchSimilarityResult:
    """Outcome of comparing one reference patch against one degraded patch."""
    # Overall similarity score for the patch pair.
    similarity: float = 0.0
    # Per-frequency-band statistics; each instance gets its own empty array
    # by default so instances never share a mutable default.
    freq_band_means: np.ndarray = field(default_factory=_empty_band_array)
    freq_band_stddevs: np.ndarray = field(default_factory=_empty_band_array)
    freq_band_deg_energy: np.ndarray = field(default_factory=_empty_band_array)

    # Timing info for the two patches. These keep their 0.0 defaults unless
    # the caller fills them in; the time unit is not specified here.
    ref_patch_start_time: float = 0.0
    ref_patch_end_time: float = 0.0
    deg_patch_start_time: float = 0.0
    deg_patch_end_time: float = 0.0
43
+
44
+
45
+ def _valid_2d_conv_with_boundary(kernel: np.ndarray,
46
+ matrix: np.ndarray) -> np.ndarray:
47
+ """
48
+ 2D convolution with boundary replication padding, then 'valid' convolution.
49
+
50
+ Matches C++ Convolution2D::Valid2DConvWithBoundary which:
51
+ 1. Pads matrix by 1 on each side with edge replication
52
+ 2. Performs valid convolution with REVERSED kernel
53
+
54
+ The C++ code reverses the kernel in column-major order during convolution.
55
+ Since our Gaussian kernel is symmetric (both row-symmetric and column-symmetric),
56
+ the reversal has no effect. We can use scipy's fast convolution directly.
57
+
58
+ The output has the same shape as the input matrix (edge-padded then valid conv).
59
+ """
60
+ from scipy.ndimage import convolve
61
+
62
+ # Pad matrix by 1 on each side with edge replication
63
+ padded = np.pad(matrix, pad_width=1, mode='edge')
64
+
65
+ # The C++ reverses the kernel in column-major layout.
66
+ # For the symmetric Gaussian kernel, this is equivalent to no reversal.
67
+ # Use scipy.ndimage.convolve which does correlation (no kernel flip)
68
+ # on the padded matrix, then take the valid region.
69
+
70
+ # scipy.ndimage.convolve handles the full convolution;
71
+ # we use mode='constant' with cval=0 since we already padded.
72
+ # But more efficiently: just use 'valid' equivalent by slicing.
73
+ from scipy.signal import correlate2d
74
+ result = correlate2d(padded, kernel, mode='valid')
75
+ return result
76
+
77
+
78
def measure_patch_similarity(ref_patch: np.ndarray,
                             deg_patch: np.ndarray) -> PatchSimilarityResult:
    """
    Compute NSIM similarity between a reference and degraded patch.

    Matches C++ NeurogramSimiliarityIndexMeasure::MeasurePatchSimilarity.

    The similarity map is the element-wise product of an intensity term
    (built from the locally smoothed means) and a structure term (built from
    the local variances and covariance). Per-band statistics are taken along
    axis 1 of that map.

    Args:
        ref_patch: (num_bands, num_frames) reference spectrogram patch.
        deg_patch: (num_bands, num_frames) degraded spectrogram patch.

    Returns:
        PatchSimilarityResult with ``similarity`` (mean of the per-band
        means), ``freq_band_means``, ``freq_band_stddevs`` (population
        standard deviation, ddof=0) and ``freq_band_deg_energy`` (per-band
        mean of ``deg_patch``). The timing fields keep their defaults.
    """
    w = GAUSSIAN_WINDOW

    # Local means
    mu_r = _valid_2d_conv_with_boundary(w, ref_patch)
    mu_d = _valid_2d_conv_with_boundary(w, deg_patch)

    # Squared means and cross term
    ref_mu_sq = mu_r * mu_r
    deg_mu_sq = mu_d * mu_d
    mu_r_mu_d = mu_r * mu_d

    # Local variances and covariance: E[x*y] - E[x]*E[y]
    sigma_r_sq = _valid_2d_conv_with_boundary(w, ref_patch * ref_patch) - ref_mu_sq
    sigma_d_sq = _valid_2d_conv_with_boundary(w, deg_patch * deg_patch) - deg_mu_sq
    sigma_r_d = _valid_2d_conv_with_boundary(w, ref_patch * deg_patch) - mu_r_mu_d

    # Intensity (luminance) component
    intensity = (2.0 * mu_r_mu_d + C1) / (ref_mu_sq + deg_mu_sq + C1)

    # Structure component
    structure_numer = sigma_r_d + C3
    var_product = sigma_r_sq * sigma_d_sq
    # The variance product can come out negative (e.g. with silent patches).
    # Clamp it at zero BEFORE the square root: negative entries then give a
    # denominator of exactly C3, without sqrt ever seeing a negative value
    # (which would emit a RuntimeWarning and create throwaway NaNs).
    structure_denom = np.sqrt(np.maximum(var_product, 0.0)) + C3
    structure = structure_numer / structure_denom

    # Combined similarity map
    sim_map = intensity * structure

    # Per-frequency-band statistics
    freq_band_means = np.mean(sim_map, axis=1)
    freq_band_stddevs = np.std(sim_map, axis=1, ddof=0)
    freq_band_deg_energy = np.mean(deg_patch, axis=1)

    # Overall similarity (mean of frequency band means)
    mean_freq_band_means = np.mean(freq_band_means)

    return PatchSimilarityResult(
        similarity=float(mean_freq_band_means),
        freq_band_means=freq_band_means,
        freq_band_stddevs=freq_band_stddevs,
        freq_band_deg_energy=freq_band_deg_energy,
    )
@@ -0,0 +1,222 @@
1
+ """
2
+ Patch creation for reference spectrograms.
3
+ - ImagePatchCreator: evenly-spaced patches (Audio mode)
4
+ - VadPatchCreator: VAD-filtered patches (Speech mode)
5
+
6
+ Corresponds to C++ files:
7
+ - image_patch_creator.cc
8
+ - vad_patch_creator.cc
9
+ - rms_vad.cc/h
10
+ """
11
+
12
+ import numpy as np
13
+ from visqol.analysis_window import AnalysisWindow
14
+ from visqol.audio_utils import AudioSignal
15
+ from visqol.signal_utils import normalize
16
+
17
+
18
+ # ============ RMS VAD ============
19
+
20
class RmsVad:
    """
    Simple RMS-based Voice Activity Detection.
    Matches C++ RmsVad class.

    Feed chunks with process_chunk(), then read the per-chunk decisions with
    get_vad_results(). A chunk is reported as silent only when it and the
    preceding (SILENT_CHUNK_COUNT - 1) chunks are all below RMS_THRESHOLD.
    """
    VOICE_ACTIVITY_PRESENT = 1.0
    VOICE_ACTIVITY_ABSENT = 0.0
    SILENT_CHUNK_COUNT = 3
    RMS_THRESHOLD = 5000.0

    def __init__(self):
        # Raw per-chunk decision (present/absent) from the RMS threshold.
        self.each_chunk_result = []
        # Initialize first (kSilentChunkCount - 1) results as voice-active
        # to avoid false negatives
        self.vad_results = [self.VOICE_ACTIVITY_PRESENT] * (self.SILENT_CHUNK_COUNT - 1)

    def process_chunk(self, chunk: np.ndarray) -> float:
        """
        Process a chunk of int16 samples.

        Records whether the chunk's RMS reaches RMS_THRESHOLD.

        Returns:
            The RMS of the chunk, computed in float64.
        """
        rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
        if rms < self.RMS_THRESHOLD:
            self.each_chunk_result.append(self.VOICE_ACTIVITY_ABSENT)
        else:
            self.each_chunk_result.append(self.VOICE_ACTIVITY_PRESENT)
        return rms

    def get_vad_results(self) -> list:
        """
        Get VAD results for all processed chunks.

        The list is rebuilt from scratch on every call, so calling this
        method repeatedly (or again after more chunks were processed) never
        duplicates earlier entries.

        Returns:
            A list that starts with (SILENT_CHUNK_COUNT - 1) voice-active
            entries, followed by one entry for each processed chunk from
            index (SILENT_CHUNK_COUNT - 1) onwards.
        """
        lead_in = self.SILENT_CHUNK_COUNT - 1
        results = [self.VOICE_ACTIVITY_PRESENT] * lead_in
        for i in range(lead_in, len(self.each_chunk_result)):
            is_silent = (not self.each_chunk_result[i] and
                         self._check_previous_chunks_for_silence(i))
            results.append(self.VOICE_ACTIVITY_ABSENT if is_silent
                           else self.VOICE_ACTIVITY_PRESENT)
        # Keep the attribute in sync for code that reads it directly.
        self.vad_results = results
        return results

    def _check_previous_chunks_for_silence(self, idx: int) -> bool:
        """Return True when the (SILENT_CHUNK_COUNT - 1) chunks before idx are all silent."""
        for j in range(1, self.SILENT_CHUNK_COUNT):
            if self.each_chunk_result[idx - j] == self.VOICE_ACTIVITY_PRESENT:
                return False
        return True
61
+
62
+
63
+ # ============ Image Patch Creator (Audio mode) ============
64
+
65
class ImagePatchCreator:
    """
    Creates evenly-spaced patches from a spectrogram.
    Used for Audio mode.
    """

    def __init__(self, patch_size: int):
        # Number of spectrogram columns (frames) in each patch.
        self.patch_size = patch_size

    # The annotations are quoted so the explicit "| None" form is never
    # evaluated at runtime and stays valid on every supported interpreter.
    def create_ref_patch_indices(self, spectrogram: np.ndarray,
                                 ref_signal: "AudioSignal | None" = None,
                                 window: "AnalysisWindow | None" = None) -> list:
        """
        Create reference patch indices at evenly-spaced intervals.

        Matches C++ ImagePatchCreator::CreateRefPatchIndices.

        Args:
            spectrogram: (num_bands, num_frames) spectrogram matrix.
            ref_signal: Unused; accepted so the signature matches
                VadPatchCreator.create_ref_patch_indices.
            window: Unused; accepted for the same reason.

        Returns:
            List of start column indices for patches.

        Raises:
            ValueError: If patch_size is not positive, or if the spectrogram
                has fewer than patch_size + patch_size // 2 columns.
        """
        if self.patch_size <= 0:
            # A non-positive step would never advance past max_index.
            raise ValueError(
                f"Patch size must be positive, got {self.patch_size}."
            )

        spectrum_length = spectrogram.shape[1]
        init_patch_index = self.patch_size // 2

        if spectrum_length < self.patch_size + init_patch_index:
            raise ValueError(
                f"Reference spectrum size ({spectrum_length}) smaller than "
                f"minimum patch size ({self.patch_size + init_patch_index})."
            )

        # C++: max_index logic. When only one patch fits, make sure the
        # range below still yields that single patch.
        if init_patch_index < (spectrum_length - self.patch_size):
            max_index = spectrum_length - self.patch_size
        else:
            max_index = init_patch_index + 1

        # C++ uses 0-based, pushes i-1
        return [i - 1
                for i in range(init_patch_index, max_index, self.patch_size)]

    def create_patches_from_indices(self, spectrogram: np.ndarray,
                                    patch_indices: list) -> list:
        """
        Extract patches from spectrogram at given start indices.

        Args:
            spectrogram: (num_bands, num_frames) matrix.
            patch_indices: List of start column indices.

        Returns:
            List of (num_bands, patch_size) patch matrices, each taken as a
            column slice of ``spectrogram``.
        """
        return [spectrogram[:, start_col:start_col + self.patch_size]
                for start_col in patch_indices]
129
+
130
+
131
+ # ============ VAD Patch Creator (Speech mode) ============
132
+
133
class VadPatchCreator:
    """
    Creates VAD-filtered patches from a spectrogram.
    Used for Speech mode.
    """

    # Minimum summed VAD score a patch needs in order to be kept.
    FRAMES_WITH_VA_THRESHOLD = 1.0

    def __init__(self, patch_size: int):
        # Number of spectrogram columns (frames) in each patch.
        self.patch_size = patch_size

    def _get_voice_activity(self, signal: AudioSignal, start_sample: int,
                            total_samples: int, frame_len: int) -> list:
        """
        Get voice activity detection results for a signal segment.

        Matches C++ VadPatchCreator::GetVoiceActivity.

        Args:
            signal: Signal whose ``data`` is sliced and analysed.
            start_sample: Index into ``signal.data`` where the segment starts.
            total_samples: Length of the segment.
            frame_len: Number of samples per VAD chunk; a trailing
                incomplete chunk is ignored.

        Returns:
            The list produced by RmsVad.get_vad_results() for the chunks.
        """
        rms_vad = RmsVad()
        # Assumes the samples are finite floats — TODO confirm against caller.
        segment = np.asarray(
            signal.data[start_sample:start_sample + total_samples],
            dtype=np.float64,
        )

        # Quantize to the int16 range in one vectorized pass: scale, clamp,
        # then truncate toward zero (what int() did per sample).
        full_scale = 1 << 15
        quantized = np.clip(
            segment * full_scale, -full_scale, full_scale - 1
        ).astype(np.int16)

        # Feed complete chunks only; a non-positive frame_len yields none.
        if frame_len > 0:
            last_start = len(quantized) - frame_len
            for begin in range(0, last_start + 1, frame_len):
                rms_vad.process_chunk(quantized[begin:begin + frame_len])

        return rms_vad.get_vad_results()

    def create_ref_patch_indices(self, spectrogram: np.ndarray,
                                 ref_signal: AudioSignal,
                                 window: AnalysisWindow) -> list:
        """
        Create VAD-filtered reference patch indices.

        Matches C++ VadPatchCreator::CreateRefPatchIndices.

        Args:
            spectrogram: Matrix whose second dimension is the frame count.
            ref_signal: Reference signal; its data is normalized before VAD.
            window: Analysis window; ``size * overlap`` gives the number of
                samples per VAD chunk.

        Returns:
            Start column indices of the patches whose summed VAD score
            reaches FRAMES_WITH_VA_THRESHOLD.
        """
        # Normalize signal
        norm_signal = AudioSignal(normalize(ref_signal.data),
                                  ref_signal.sample_rate)

        frame_size = int(window.size * window.overlap)
        patch_sample_len = self.patch_size * frame_size
        spectrum_length = spectrogram.shape[1]
        first_patch_idx = self.patch_size // 2 - 1
        patch_count = (spectrum_length - first_patch_idx) // self.patch_size
        total_sample_count = patch_count * patch_sample_len

        # Get VAD results.
        # NOTE(review): first_patch_idx is a frame index but is passed as the
        # starting *sample*; kept as-is, presumably mirroring the C++ — confirm.
        vad_res = self._get_voice_activity(
            norm_signal, first_patch_idx, total_sample_count, frame_size
        )

        # Filter patches based on VAD
        ref_patch_indices = []
        for i in range(patch_count):
            offset = i * self.patch_size
            # Slicing clamps at the end of the list, so a short final
            # window needs no special case.
            frames_with_va = sum(vad_res[offset:offset + self.patch_size])
            if frames_with_va >= self.FRAMES_WITH_VA_THRESHOLD:
                ref_patch_indices.append(first_patch_idx + offset)

        return ref_patch_indices

    def create_patches_from_indices(self, spectrogram: np.ndarray,
                                    patch_indices: list) -> list:
        """
        Extract patches from spectrogram at given indices.

        Returns:
            One column slice of width patch_size per start index.
        """
        return [spectrogram[:, start_col:start_col + self.patch_size]
                for start_col in patch_indices]