visqol-python 3.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
visqol/gammatone.py ADDED
@@ -0,0 +1,418 @@
1
+ """
2
+ Gammatone filterbank, ERB coefficient computation, and spectrogram builder.
3
+
4
+ Corresponds to C++ files:
5
+ - equivalent_rectangular_bandwidth.cc
6
+ - gammatone_filterbank.cc
7
+ - gammatone_spectrogram_builder.cc
8
+ - signal_filter.cc
9
+ """
10
+
11
+ import logging
12
+ import numpy as np
13
+ from scipy.signal import lfilter
14
+
15
+ from visqol.analysis_window import AnalysisWindow
16
+ from visqol.audio_utils import AudioSignal
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Glasberg and Moore parameters for the equivalent rectangular bandwidth
# (ERB) model. make_erb_filters combines them as
# ((cf / EAR_Q) ** ORDER + MIN_BW ** ORDER) ** (1 / ORDER).
EAR_Q: float = 9.26449
MIN_BW: float = 24.7
ORDER: float = 1.0

# Upper frequency limit (Hz) used instead of sample_rate / 2 when the
# spectrogram builder runs in speech mode.
SPEECH_MODE_MAX_FREQ: float = 8000.0
27
+
28
+
29
class ErbFiltersResult:
    """
    Pairs ERB center frequencies with their gammatone filter coefficients.

    Attributes:
        center_freqs: Array of center frequencies (num_channels,).
        filter_coeffs: Coefficient matrix (10, num_channels).
            Rows: A0, A11, A12, A13, A14, A2, B0, B1, B2, gain
    """

    def __init__(self, center_freqs: np.ndarray, filter_coeffs: np.ndarray):
        """
        Store both arrays exactly as passed in (no copy, no validation).

        Args:
            center_freqs: Array of center frequencies (num_channels,).
            filter_coeffs: Coefficient matrix (10, num_channels).
        """
        self.center_freqs, self.filter_coeffs = center_freqs, filter_coeffs
41
+
42
+
43
def calc_center_freqs(low_freq: float, high_freq: float,
                      num_channels: int) -> np.ndarray:
    """
    Place num_channels center frequencies at equal steps on the ERB scale.
    Equivalent to Slaney's ERBSpace function.

    Args:
        low_freq: Lowest center frequency.
        high_freq: Highest center frequency.
        num_channels: Number of frequency channels.

    Returns:
        Array of center frequencies (num_channels,). When
        low_freq < high_freq the values run from highest to lowest.
    """
    # Constant that shifts frequencies onto the ERB scale and back.
    erb_offset = EAR_Q * MIN_BW
    shifted_high = high_freq + erb_offset

    # Log-domain distance between neighbouring channels (negative when
    # low_freq < high_freq, which is why the result is descending).
    log_step = (np.log(low_freq + erb_offset) - np.log(shifted_high)) / num_channels

    channel_numbers = np.arange(1, num_channels + 1)
    return np.exp(channel_numbers * log_step) * shifted_high - erb_offset
65
+
66
+
67
def make_erb_filters(sample_rate: int, num_channels: int,
                     low_freq: float, high_freq: float) -> ErbFiltersResult:
    """
    Compute ERB gammatone filter coefficients.
    Python port of EquivalentRectangularBandwidth::MakeFilters.

    Every channel gets one shared denominator (B0, B1, B2), four numerator
    variants that differ only in their middle term (A11..A14), and a gain
    value taken as the magnitude of a complex expression.

    Args:
        sample_rate: Sample rate in Hz.
        num_channels: Number of filter channels.
        low_freq: Lowest center frequency.
        high_freq: Highest center frequency. Values above sample_rate / 2
            are clamped to sample_rate / 2 and a warning is logged.

    Returns:
        ErbFiltersResult containing center freqs and a
        (10, num_channels) coefficient matrix with rows
        A0, A11, A12, A13, A14, A2, B0, B1, B2, gain.
    """
    half_rate = sample_rate / 2.0
    if high_freq > half_rate:
        # The message states the same strict comparison the check performs.
        logger.warning(
            "high_freq (%.1f) > sample_rate/2 (%.1f), falling back to sample_rate/2",
            high_freq, half_rate
        )
        high_freq = half_rate

    cf = calc_center_freqs(low_freq, high_freq, num_channels)
    T = 1.0 / sample_rate

    # ERB bandwidth
    erb = ((cf / EAR_Q) ** ORDER + MIN_BW ** ORDER) ** (1.0 / ORDER)
    B = 1.019 * 2.0 * np.pi * erb

    # Terms shared by the coefficient and gain formulas, evaluated once.
    phase = 2.0 * cf * np.pi * T
    cos_phase = np.cos(phase)
    sin_phase = np.sin(phase)
    exp_bt = np.exp(B * T)
    root_minus = np.sqrt(3.0 - 2.0 ** 1.5)
    root_plus = np.sqrt(3.0 + 2.0 ** 1.5)

    # Denominator coefficients
    B1_coeff = -2.0 * cos_phase / exp_bt
    B2_coeff = np.exp(-2.0 * B * T)

    # Middle numerator coefficient of each of the four stages
    sin_t = sin_phase * T
    b_pos = sin_t * 2.0 * root_plus
    b_neg = sin_t * 2.0 * root_minus
    cos_2t = cos_phase * 2.0 * T

    A11 = -(cos_2t / exp_bt + b_pos / exp_bt) / 2.0
    A12 = -(cos_2t / exp_bt - b_pos / exp_bt) / 2.0
    A13 = -(cos_2t / exp_bt + b_neg / exp_bt) / 2.0
    A14 = -(cos_2t / exp_bt - b_neg / exp_bt) / 2.0

    # Gain calculation (complex arithmetic)
    x_exp = np.exp(4.0j * cf * np.pi * T)
    x01 = -2.0 * x_exp * T
    x02 = 2.0 * np.exp((-B + 2.0j * cf * np.pi) * T) * T

    x1 = x01 + x02 * (cos_phase - root_minus * sin_phase)
    x2 = x01 + x02 * (cos_phase + root_minus * sin_phase)
    x3 = x01 + x02 * (cos_phase - root_plus * sin_phase)
    x4 = x01 + x02 * (cos_phase + root_plus * sin_phase)

    x5 = (-2.0 / np.exp(2.0 * B * T)
          - 2.0 * x_exp
          + 2.0 * (1.0 + x_exp) / exp_bt)

    gain = np.abs(x1 * x2 * x3 * x4 / (x5 ** 4))

    # Assemble coefficient matrix (10 rows x num_channels columns)
    A0 = np.full(num_channels, T)
    A2 = np.zeros(num_channels)
    B0 = np.ones(num_channels)

    filter_coeffs = np.array([
        A0,        # 0: A0
        A11,       # 1: A11
        A12,       # 2: A12
        A13,       # 3: A13
        A14,       # 4: A14
        A2,        # 5: A2
        B0,        # 6: B0
        B1_coeff,  # 7: B1
        B2_coeff,  # 8: B2
        gain,      # 9: gain
    ])

    return ErbFiltersResult(center_freqs=cf, filter_coeffs=filter_coeffs)
154
+
155
+
156
+ def _iir_filter(b: np.ndarray, a: np.ndarray, signal: np.ndarray,
157
+ zi: np.ndarray) -> tuple:
158
+ """
159
+ Apply IIR filter (Direct Form II transposed), matching C++ SignalFilter::Filter.
160
+
161
+ This uses scipy.signal.lfilter which implements exactly the same
162
+ Direct Form II transposed difference equations.
163
+
164
+ Args:
165
+ b: Numerator coefficients [b0, b1, b2].
166
+ a: Denominator coefficients [a0, a1, a2].
167
+ signal: Input signal.
168
+ zi: Initial filter conditions (length = max(len(a), len(b)) - 1).
169
+
170
+ Returns:
171
+ Tuple of (filtered_signal, final_conditions).
172
+ """
173
+ y, zf = lfilter(b, a, signal, zi=zi)
174
+ return y, zf
175
+
176
+
177
class GammatoneFilterBank:
    """
    Gammatone filterbank that applies 4-stage cascaded IIR filtering.
    Each stage uses different A coefficients but the same B (denominator) coefficients.

    The per-stage, per-band filter state is kept between apply_filter calls
    until reset_conditions() is called.
    """

    def __init__(self, num_bands: int, min_freq: float):
        """
        Args:
            num_bands: Number of frequency bands (channels) to filter.
            min_freq: Minimum center frequency; stored for callers, not
                used by the filtering itself.
        """
        self.num_bands = num_bands
        self.min_freq = min_freq
        # Filter conditions for 4 stages: _conditions[stage][band] is a
        # length-2 state vector. Start from zero state so apply_filter works
        # without an explicit reset_conditions() call first.
        self._conditions = None
        self.reset_conditions()

    def reset_conditions(self):
        """Reset all filter conditions to zero."""
        self._conditions = [
            [np.zeros(2) for _ in range(self.num_bands)]
            for _ in range(4)
        ]

    def apply_filter(self, signal: np.ndarray,
                     filter_coeffs: np.ndarray) -> np.ndarray:
        """
        Apply 4-stage cascaded gammatone filter to signal for all bands.

        Each row of filter_coeffs holds one coefficient for every band, in
        the row order [A0, A11, A12, A13, A14, A2, B0, B1, B2, gain].
        Column j is used for output band j, so the band order of the result
        is whatever column order the caller supplies.

        The stored filter conditions are read as the initial state and
        overwritten with the final state of each stage.

        Args:
            signal: Input signal frame (1D array).
            filter_coeffs: (10, num_bands) coefficient matrix.

        Returns:
            (num_bands, len(signal)) filtered output matrix.
        """
        output = np.zeros((self.num_bands, len(signal)))

        # Extract coefficient vectors, each of shape (num_bands,)
        A0 = filter_coeffs[0]
        A11 = filter_coeffs[1]
        A12 = filter_coeffs[2]
        A13 = filter_coeffs[3]
        A14 = filter_coeffs[4]
        A2 = filter_coeffs[5]
        B0 = filter_coeffs[6]
        B1 = filter_coeffs[7]
        B2 = filter_coeffs[8]
        gain = filter_coeffs[9]

        for chan in range(self.num_bands):
            # Denominator is the same for all 4 stages
            denom = np.array([B0[chan], B1[chan], B2[chan]])

            # One numerator per stage; only stage 1 is normalized by gain.
            numerators = (
                np.array([A0[chan], A11[chan], A2[chan]]) / gain[chan],
                np.array([A0[chan], A12[chan], A2[chan]]),
                np.array([A0[chan], A13[chan], A2[chan]]),
                np.array([A0[chan], A14[chan], A2[chan]]),
            )

            # 4-stage cascade: each stage filters the previous stage's output
            y = signal
            for stage, numer in enumerate(numerators):
                y, zf = lfilter(numer, denom, y,
                                zi=self._conditions[stage][chan])
                self._conditions[stage][chan] = zf

            output[chan] = y

        return output
261
+
262
+
263
class Spectrogram:
    """
    Holds a spectrogram matrix together with its band center frequencies.
    """

    def __init__(self, data: np.ndarray,
                 center_freq_bands: np.ndarray = None):
        """
        Args:
            data: (num_bands, num_frames) spectrogram matrix; converted to
                a float64 array.
            center_freq_bands: Center frequencies for each band (low to high).
                An empty array is stored when omitted.
        """
        self.data = np.asarray(data, dtype=np.float64)
        if center_freq_bands is None:
            center_freq_bands = np.array([])
        self.center_freq_bands = center_freq_bands

    @property
    def num_bands(self) -> int:
        """Number of rows in the data matrix."""
        return self.data.shape[0]

    @property
    def num_frames(self) -> int:
        """Number of columns in the data matrix."""
        return self.data.shape[1]
285
+
286
+
287
def convert_to_db(matrix: np.ndarray) -> np.ndarray:
    """
    Map spectrogram values to decibels as 10 * log10(|x|).

    Exact zeros are substituted with float64 machine epsilon before the
    logarithm so the result stays finite. The input is not modified.
    Matches C++ Spectrogram::ConvertSampleToDb.
    """
    tiny = np.finfo(np.float64).eps
    magnitude = np.abs(matrix)
    is_zero = magnitude == 0
    safe_magnitude = np.where(is_zero, tiny, magnitude)
    return 10.0 * np.log10(safe_magnitude)
296
+
297
+
298
def prepare_spectrograms_for_comparison(
    ref_spec: Spectrogram, deg_spec: Spectrogram
) -> tuple:
    """
    Bring a reference and a degraded spectrogram onto a common dB scale.

    Processing order:
    1. Convert to dB
    2. Apply absolute noise floor (-45 dB)
    3. Apply per-frame relative noise floor (peak - 45 dB), where the peak
       is taken over both spectrograms; only frames present in both are
       processed
    4. Normalize to 0 dB global floor

    Matches C++ MiscAudio::PrepareSpectrogramsForComparison.

    Returns:
        Tuple of (ref_db, deg_db) as numpy arrays.
    """
    absolute_floor_db = -45.0
    below_peak_db = 45.0

    # Steps 1 and 2: dB conversion followed by the absolute floor.
    ref_db = np.maximum(convert_to_db(ref_spec.data), absolute_floor_db)
    deg_db = np.maximum(convert_to_db(deg_spec.data), absolute_floor_db)

    # Step 3: raise each shared frame to (joint peak - 45 dB).
    shared_frames = min(ref_db.shape[1], deg_db.shape[1])
    for frame in range(shared_frames):
        ref_col = ref_db[:, frame]
        deg_col = deg_db[:, frame]
        frame_floor = max(np.max(ref_col), np.max(deg_col)) - below_peak_db

        # The column slices are views, so writing into them updates the
        # matrices in place.
        np.maximum(ref_col, frame_floor, out=ref_col)
        np.maximum(deg_col, frame_floor, out=deg_col)

    # Step 4: shift both so the smallest value across the pair becomes 0.
    global_floor = min(np.min(ref_db), np.min(deg_db))
    ref_db -= global_floor
    deg_db -= global_floor

    return ref_db, deg_db
341
+
342
+
343
class GammatoneSpectrogramBuilder:
    """
    Builds a gammatone-filtered spectrogram from an audio signal.
    """

    def __init__(self, num_bands: int, min_freq: float,
                 speech_mode: bool = False):
        """
        Args:
            num_bands: Number of frequency bands.
            min_freq: Minimum center frequency.
            speech_mode: If True, cap max frequency at 8000 Hz.
        """
        self.filter_bank = GammatoneFilterBank(num_bands, min_freq)
        self.speech_mode = speech_mode

    def build(self, signal: AudioSignal,
              window: AnalysisWindow) -> Spectrogram:
        """
        Build a gammatone spectrogram from an audio signal.

        The signal is cut into frames of window.size samples, advanced by
        int(window.size * window.overlap) samples per frame. Each frame is
        Hann-windowed, filtered with freshly reset filter conditions, and
        reduced to one RMS value per band.

        Args:
            signal: Input audio signal.
            window: Analysis window parameters.

        Returns:
            Spectrogram object with bands ordered from lowest to highest
            center frequency.

        Raises:
            ValueError: If signal is too short, or if the hop size derived
                from the window is not positive.
        """
        sig = signal.data
        sample_rate = signal.sample_rate
        num_bands = self.filter_bank.num_bands

        max_freq = SPEECH_MODE_MAX_FREQ if self.speech_mode else sample_rate / 2.0

        # Compute ERB filter coefficients
        erb_result = make_erb_filters(
            sample_rate, num_bands, self.filter_bank.min_freq, max_freq
        )
        # The coefficient matrix is (10, num_bands); reversing its columns
        # reverses the band order so it lines up with the reversed center
        # frequencies returned below.
        filter_coeffs = erb_result.filter_coeffs[:, ::-1]

        # Setup windowing
        hop_size = int(window.size * window.overlap)

        if len(sig) <= window.size:
            raise ValueError(
                f"Too few samples ({len(sig)}) to build spectrogram "
                f"({window.size} required minimum)."
            )

        # A zero hop would divide by zero below; a negative one would give
        # a meaningless frame count. Fail with a descriptive error instead.
        if hop_size <= 0:
            raise ValueError(
                f"Hop size must be positive, got {hop_size} "
                f"(window size {window.size}, overlap {window.overlap})."
            )

        num_cols = 1 + int((len(sig) - window.size) // hop_size)
        out_matrix = np.zeros((num_bands, num_cols))

        for i in range(num_cols):
            start = i * hop_size
            frame = sig[start:start + window.size].copy()

            # Apply Hann window
            windowed_frame = window.apply_hann_window(frame)

            # Reset filter conditions for each frame
            self.filter_bank.reset_conditions()

            # Apply gammatone filter bank
            filtered = self.filter_bank.apply_filter(windowed_frame, filter_coeffs)

            # RMS per band: sqrt(mean(filtered^2))
            out_matrix[:, i] = np.sqrt(np.mean(filtered ** 2, axis=1))

        # Order center frequencies from lowest to highest (reverse the ERB order)
        ordered_cfs = erb_result.center_freqs[::-1].copy()

        return Spectrogram(out_matrix, center_freq_bands=ordered_cfs)