python-peass 2.0.1__py3-none-any.whl → 2.0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- peass/__init__.py +22 -10
- peass/auditory_model.py +318 -112
- peass/config.py +73 -0
- peass/decomposition.py +558 -339
- peass/gammatone.py +481 -145
- peass/metrics.py +196 -120
- peass/predictor.py +116 -99
- python_peass-2.0.1.2.dist-info/METADATA +206 -0
- python_peass-2.0.1.2.dist-info/RECORD +15 -0
- python_peass-2.0.1.dist-info/METADATA +0 -165
- python_peass-2.0.1.dist-info/RECORD +0 -14
- {python_peass-2.0.1.dist-info → python_peass-2.0.1.2.dist-info}/WHEEL +0 -0
- {python_peass-2.0.1.dist-info → python_peass-2.0.1.2.dist-info}/licenses/LICENSE +0 -0
peass/__init__.py
CHANGED
|
@@ -1,18 +1,30 @@
|
|
|
1
1
|
"""
|
|
2
2
|
python-peass: Perceptual Evaluation methods for Audio Source Separation
|
|
3
|
-
A modern, Pythonic port of the PEASS v2.0.1 toolkit
|
|
3
|
+
A modern, Pythonic port of the PEASS v2.0.1 toolkit.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
__version__ = "2.0.1"
|
|
6
|
+
__version__ = "2.0.1.2" # matches peass version, with one more segment for me to edit
|
|
7
7
|
|
|
8
|
-
from .
|
|
9
|
-
from .
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
8
|
+
from .config import DecomposedFilePaths
|
|
9
|
+
from .config import DecomposedWaveforms
|
|
10
|
+
from .config import DecompositionConfiguration
|
|
11
|
+
from .config import DecompositionResult
|
|
12
|
+
from .config import ModulationProcessingType
|
|
13
|
+
from .config import PerceptualSeparationScores
|
|
14
|
+
from .decomposition import decompose_distortion_components
|
|
15
|
+
from .metrics import calculate_auditory_quality_features
|
|
16
|
+
from .metrics import calculate_bss_eval_energy_ratios
|
|
17
|
+
from .predictor import predict_perceptual_evaluation_scores
|
|
12
18
|
|
|
13
19
|
__all__ = [
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
"
|
|
20
|
+
"DecomposedFilePaths",
|
|
21
|
+
"DecomposedWaveforms",
|
|
22
|
+
"DecompositionConfiguration",
|
|
23
|
+
"DecompositionResult",
|
|
24
|
+
"ModulationProcessingType",
|
|
25
|
+
"PerceptualSeparationScores",
|
|
26
|
+
"predict_perceptual_evaluation_scores",
|
|
27
|
+
"decompose_distortion_components",
|
|
28
|
+
"calculate_bss_eval_energy_ratios",
|
|
29
|
+
"calculate_auditory_quality_features",
|
|
18
30
|
]
|
peass/auditory_model.py
CHANGED
|
@@ -1,158 +1,364 @@
|
|
|
1
1
|
"""
|
|
2
|
-
PEASS Auditory Package - Dau 1996/1997 Psychoacoustic Ear Model
|
|
2
|
+
PEASS Auditory Package - Dau 1996/1997 Psychoacoustic Ear Model
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
and
|
|
4
|
+
Simulates the transduction process of the inner hair cells and the temporal
|
|
5
|
+
adaptation (forward masking) of the auditory nerve. Uses Numba if available,
|
|
6
|
+
and fails over gracefully to a SciPy/NumPy native vectorization.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
import math
|
|
9
10
|
from typing import Tuple
|
|
10
11
|
|
|
11
12
|
import numpy as np
|
|
12
13
|
import scipy.signal as signal
|
|
13
14
|
|
|
15
|
+
from .config import ModulationProcessingType
|
|
14
16
|
from .gammatone import GammatoneAnalyzer
|
|
17
|
+
from .gammatone import fast_resample_poly
|
|
15
18
|
|
|
16
|
-
#
|
|
19
|
+
# -----------------------------------------------------------------------------
|
|
20
|
+
# NUMBA JIT COMPILATION (WITH SAFE IMPORT FALLBACK)
|
|
21
|
+
# -----------------------------------------------------------------------------
|
|
17
22
|
try:
|
|
18
23
|
import numba
|
|
19
24
|
|
|
20
25
|
_HAS_NUMBA = True
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@numba.njit(cache=True)
def _numba_fused_auditory_kernel(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float,
    haircell_filter_gain: float,
    adaptation_bandwidths: np.ndarray,
    absolute_hearing_threshold: float
) -> np.ndarray:
    """
    Rectify, low-pass and adapt every band in one sweep over the samples.

    Args:
        subband_signals: 2-D array indexed as [band, sample].
        sampling_frequency_hz: Sampling rate used to derive the per-stage gains.
        haircell_filter_gain: Feedback coefficient of the first-order low-pass;
            the rectified input is weighted by ``1 - haircell_filter_gain``.
        adaptation_bandwidths: Indexable holding at least five bandwidths; stage
            ``k`` uses the gain ``exp(-pi * bandwidth[k] / fs)``.
        absolute_hearing_threshold: Floor applied to each low-passed sample. Its
            successive square roots are the minimum divisors of the five stages.

    Returns:
        Array created with ``np.empty_like(subband_signals)`` holding the
        adapted value of every sample.
    """
    n_stages = 5
    band_count, sample_count = subband_signals.shape
    result = np.empty_like(subband_signals)

    # Per-stage constants: minimum divisor (repeated square root of the
    # threshold) and the smoothing gain of the divisor update.
    floors = np.empty(n_stages, dtype=np.float64)
    decays = np.empty(n_stages, dtype=np.float64)
    floor = absolute_hearing_threshold
    for k in range(n_stages):
        floor = math.sqrt(floor)
        floors[k] = floor
        decays[k] = math.exp(-math.pi * adaptation_bandwidths[k] / sampling_frequency_hz)

    input_weight = 1.0 - haircell_filter_gain
    divisors = np.empty_like(floors)

    for b in range(band_count):
        # Every band starts from a cleared low-pass state and floor-level divisors.
        lowpass_state = 0.0
        for k in range(n_stages):
            divisors[k] = floors[k]

        for t in range(sample_count):
            # Half-wave rectification.
            x = subband_signals[b, t]
            if x < 0.0:
                x = 0.0

            # First-order recursive low-pass.
            x = haircell_filter_gain * lowpass_state + input_weight * x
            lowpass_state = x

            # Clamp to the hearing threshold before entering the loops.
            if x < absolute_hearing_threshold:
                x = absolute_hearing_threshold

            # Five chained divide-and-track stages; each divisor follows the
            # stage output but never drops below that stage's floor.
            for k in range(n_stages):
                previous_divisor = divisors[k]
                x = x / previous_divisor
                divisors[k] = max(
                    (1.0 - decays[k]) * x + decays[k] * previous_divisor,
                    floors[k]
                )

            result[b, t] = x

    return result
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@numba.njit(cache=True)
def _numba_haircell_transduction_kernel(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float
) -> np.ndarray:
    """
    Half-wave rectify each band and smooth it with a first-order low-pass.

    Args:
        subband_signals: 2-D array indexed as [band, sample].
        sampling_frequency_hz: Sampling rate; the filter's feedback coefficient
            is ``exp(-pi * 2000 / sampling_frequency_hz)``.

    Returns:
        Array created with ``np.empty_like(subband_signals)`` holding the
        filtered samples.
    """
    band_count, sample_count = subband_signals.shape
    result = np.empty_like(subband_signals)

    feedback = math.exp(-math.pi * 2000.0 / sampling_frequency_hz)
    input_weight = 1.0 - feedback

    for b in range(band_count):
        # The recursion restarts from zero for every band.
        lowpass_state = 0.0
        for t in range(sample_count):
            x = subband_signals[b, t]
            if x < 0.0:
                x = 0.0
            x = feedback * lowpass_state + input_weight * x
            lowpass_state = x
            result[b, t] = x

    return result
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@numba.njit(cache=True)
def _numba_adaptation_loops_kernel(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float,
    adaptation_bandwidths: np.ndarray,
    absolute_hearing_threshold: float
) -> np.ndarray:
    """
    Run the five chained adaptation stages on already-transduced band signals.

    Args:
        subband_signals: 2-D array indexed as [band, sample].
        sampling_frequency_hz: Sampling rate used to derive the per-stage gains.
        adaptation_bandwidths: Indexable holding at least five bandwidths; stage
            ``k`` uses the gain ``exp(-pi * bandwidth[k] / fs)``.
        absolute_hearing_threshold: Floor applied to each input sample. Its
            successive square roots are the minimum divisors of the five stages.

    Returns:
        Array created with ``np.empty_like(subband_signals)`` holding the
        adapted value of every sample.
    """
    n_stages = 5
    band_count, sample_count = subband_signals.shape
    result = np.empty_like(subband_signals)

    # Per-stage minimum divisor and divisor-update gain.
    floors = np.empty(n_stages, dtype=np.float64)
    decays = np.empty(n_stages, dtype=np.float64)
    floor = absolute_hearing_threshold
    for k in range(n_stages):
        floor = math.sqrt(floor)
        floors[k] = floor
        decays[k] = math.exp(-math.pi * adaptation_bandwidths[k] / sampling_frequency_hz)

    divisors = np.empty_like(floors)

    for b in range(band_count):
        # Each band starts with every divisor at its floor.
        for k in range(n_stages):
            divisors[k] = floors[k]

        for t in range(sample_count):
            x = subband_signals[b, t]
            if x < absolute_hearing_threshold:
                x = absolute_hearing_threshold

            # Divide by the current divisor, then let the divisor track the
            # stage output without falling below the stage floor.
            for k in range(n_stages):
                previous_divisor = divisors[k]
                x = x / previous_divisor
                divisors[k] = max(
                    (1.0 - decays[k]) * x + decays[k] * previous_divisor,
                    floors[k]
                )

            result[b, t] = x

    return result
|
|
163
|
+
|
|
21
164
|
except ImportError:
|
|
22
165
|
_HAS_NUMBA = False
|
|
23
166
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
167
|
+
|
|
168
|
+
# -----------------------------------------------------------------------------
|
|
169
|
+
# PURE PYTHON/SCIPY FALLBACKS
|
|
170
|
+
# -----------------------------------------------------------------------------
|
|
171
|
+
def _fallback_adaptation_loops(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float,
    adaptation_bandwidths: np.ndarray,
    absolute_hearing_threshold: float
) -> np.ndarray:
    """
    NumPy implementation of the five nonlinear adaptation stages.

    The stages are processed one after another; within a stage the samples are
    visited in order while all bands are updated together as one vector.

    Args:
        subband_signals: 2-D array indexed as [band, sample]. It is not
            modified; the result is a new array.
        sampling_frequency_hz: Sampling rate used to derive the per-stage gains.
        adaptation_bandwidths: Indexable holding at least five bandwidths; stage
            ``k`` uses the gain ``exp(-pi * bandwidth[k] / fs)``.
        absolute_hearing_threshold: Floor applied to the input. Its successive
            square roots are the minimum divisors of the five stages.

    Returns:
        The adapted signals, same shape as ``subband_signals``.
    """
    sample_count = subband_signals.shape[1]
    band_count = subband_signals.shape[0]

    # np.maximum allocates a fresh array, so the caller's data stays untouched.
    adapted = np.maximum(subband_signals, absolute_hearing_threshold)

    floor = absolute_hearing_threshold
    for stage in range(5):
        decay = math.exp(-math.pi * adaptation_bandwidths[stage] / sampling_frequency_hz)
        floor = math.sqrt(floor)

        # One divisor per band, starting at this stage's floor.
        divisor = np.full(band_count, floor, dtype=np.float64)

        for t in range(sample_count):
            column = adapted[:, t] / divisor
            adapted[:, t] = column
            # The divisor tracks the stage output but never drops below the floor.
            divisor = np.maximum((1.0 - decay) * column + decay * divisor, floor)

    return adapted
|
|
40
200
|
|
|
41
|
-
Stages:
|
|
42
|
-
1. Half-wave rectification (simulates unidirectional shearing of hair bundle)
|
|
43
|
-
2. 1 kHz first-order lowpass filter (simulates inner hair cell membrane limits)
|
|
44
|
-
"""
|
|
45
|
-
# % gain=exp(-pi*2000/fs);
|
|
46
|
-
# % rx=filter(1-gain,[1 -gain],max(rx,0),[],2);
|
|
47
|
-
gain_haircell = np.exp(-np.pi * 2000.0 / sampling_frequency)
|
|
48
|
-
b_hc = np.array([1.0 - gain_haircell])
|
|
49
|
-
a_hc = np.array([1.0, -gain_haircell])
|
|
50
201
|
|
|
51
|
-
|
|
202
|
+
def _fallback_fused_auditory_kernel(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float,
    haircell_filter_gain: float,
    adaptation_bandwidths: np.ndarray,
    absolute_hearing_threshold: float
) -> np.ndarray:
    """
    SciPy/NumPy counterpart of the fused kernel: rectify, low-pass, then adapt.

    Args:
        subband_signals: Band signals; filtering runs along the last axis.
        sampling_frequency_hz: Sampling rate, forwarded to the adaptation loops.
        haircell_filter_gain: Feedback coefficient ``g`` of the first-order
            low-pass ``y[n] = (1 - g) * x[n] + g * y[n - 1]``.
        adaptation_bandwidths: Forwarded to ``_fallback_adaptation_loops``.
        absolute_hearing_threshold: Forwarded to ``_fallback_adaptation_loops``.

    Returns:
        Whatever ``_fallback_adaptation_loops`` returns for the low-passed signals.
    """
    # Half-wave rectification: negative samples become zero.
    non_negative = np.maximum(subband_signals, 0.0)

    # First-order recursive low-pass, evaluated by scipy.signal.lfilter.
    feedforward = np.array([1.0 - haircell_filter_gain])
    feedback = np.array([1.0, -haircell_filter_gain])
    lowpassed = signal.lfilter(feedforward, feedback, non_negative, axis=-1)

    return _fallback_adaptation_loops(
        lowpassed,
        sampling_frequency_hz,
        adaptation_bandwidths,
        absolute_hearing_threshold
    )
|
|
60
223
|
|
|
61
|
-
Runs 5 consecutive non-linear feedback loops modeling forward masking,
|
|
62
|
-
vectorized across all bands for optimal execution in Python.
|
|
63
|
-
"""
|
|
64
|
-
dbrange = 100.0
|
|
65
|
-
thresh = 10.0 ** (-dbrange / 20.0)
|
|
66
|
-
bw_loop = 1.0 / (np.pi * np.array([0.005, 0.05, 0.129, 0.253, 0.5]))
|
|
67
224
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
225
|
+
# -----------------------------------------------------------------------------
|
|
226
|
+
# EXPOSED API (STRICT PEP-484 TYPING)
|
|
227
|
+
# -----------------------------------------------------------------------------
|
|
228
|
+
def simulate_inner_haircell_transduction(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float
) -> np.ndarray:
    """
    Half-wave rectify the band signals and low-pass them with a first-order filter.

    Dispatches to the Numba kernel when Numba was imported successfully and to
    ``scipy.signal.lfilter`` otherwise. Both paths use the feedback coefficient
    ``exp(-pi * 2000 / sampling_frequency_hz)``.

    Args:
        subband_signals: Band signals. The Numba kernel unpacks the shape as
            (bands, samples); the SciPy path filters along the last axis.
        sampling_frequency_hz: Sampling rate of ``subband_signals``.

    Returns:
        The rectified and low-passed signals.
    """
    if _HAS_NUMBA:
        return _numba_haircell_transduction_kernel(subband_signals, sampling_frequency_hz)

    # SciPy path: y[n] = (1 - g) * max(x[n], 0) + g * y[n - 1].
    feedback_gain = math.exp(-math.pi * 2000.0 / sampling_frequency_hz)
    non_negative = np.maximum(subband_signals, 0.0)
    feedforward = np.array([1.0 - feedback_gain])
    feedback = np.array([1.0, -feedback_gain])
    return signal.lfilter(feedforward, feedback, non_negative, axis=-1)
|
|
71
241
|
|
|
72
|
-
# Process each of the 5 adaptive stages
|
|
73
|
-
sthresh = thresh
|
|
74
|
-
for stage_idx in range(5):
|
|
75
|
-
gain_val = np.exp(-np.pi * bw_loop[stage_idx] / sampling_frequency)
|
|
76
|
-
sthresh = np.sqrt(sthresh)
|
|
77
|
-
factor = np.full(num_bands, sthresh, dtype=np.float32) # divisor factor for each band
|
|
78
|
-
|
|
79
|
-
if _HAS_NUMBA:
|
|
80
|
-
# Compiled loop executing at native C speeds
|
|
81
|
-
rx = _numba_adaptation_loop(rx, float(gain_val), float(sthresh), factor)
|
|
82
|
-
else:
|
|
83
|
-
# Fallback pure-Python loop
|
|
84
|
-
for sample_idx in range(num_samples):
|
|
85
|
-
# Divide current sample by current divisor factor
|
|
86
|
-
val = rx[:, sample_idx] / factor
|
|
87
|
-
rx[:, sample_idx] = val
|
|
88
|
-
# Update divisor filter state
|
|
89
|
-
factor = np.maximum((1.0 - gain_val) * val + gain_val * factor, sthresh)
|
|
90
242
|
|
|
91
|
-
|
|
92
|
-
|
|
243
|
+
def simulate_auditory_nerve_adaptation(
    subband_signals: np.ndarray,
    sampling_frequency_hz: float
) -> np.ndarray:
    """
    Apply the five adaptation loops and rescale the result.

    The hearing threshold is ``10 ** (-100 / 20)`` and the loop bandwidths are
    ``1 / (pi * tau)`` for ``tau`` in (0.005, 0.05, 0.129, 0.253, 0.5). The
    loops run in the Numba kernel when Numba is available, otherwise in the
    NumPy fallback.

    Args:
        subband_signals: 2-D array indexed as [band, sample].
        sampling_frequency_hz: Sampling rate of ``subband_signals``.

    Returns:
        The adapted signals after a linear rescale that maps the final stage
        floor to 0 and an adapted value of 1.0 to 100.
    """
    decibel_range = 100.0
    hearing_floor = 10.0 ** (-decibel_range / 20.0)
    loop_bandwidths = 1.0 / (np.pi * np.array([0.005, 0.05, 0.129, 0.253, 0.5]))

    # Only the selected branch is evaluated, so the Numba kernel name is never
    # touched when Numba is missing.
    run_loops = _numba_adaptation_loops_kernel if _HAS_NUMBA else _fallback_adaptation_loops
    adapted = run_loops(subband_signals, sampling_frequency_hz, loop_bandwidths, hearing_floor)

    # Floor of the last stage: five successive square roots of the threshold.
    last_stage_floor = hearing_floor
    for _ in range(5):
        last_stage_floor = math.sqrt(last_stage_floor)

    scale = decibel_range / (1.0 - last_stage_floor)
    return scale * (adapted - last_stage_floor)
|
|
93
266
|
|
|
94
267
|
|
|
95
|
-
def
|
|
268
|
+
def generate_auditory_internal_representation(
    signal_data: np.ndarray,
    sampling_frequency_hz: float,
    modulation_processing_type: ModulationProcessingType = ModulationProcessingType.LOWPASS
) -> Tuple[np.ndarray, float]:
    """
    Build the [band, sample, modulation-channel] internal representation of a signal.

    Pipeline: input scaling, optional resampling to 1.5x the input rate,
    gammatone analysis, fused hair-cell transduction and adaptation, rescaling,
    resampling to the modulation rate, and one first-order modulation filter
    per modulation channel.

    Args:
        signal_data: Input samples. Arrays with more than one dimension are
            transposed when the first dimension is the shorter one, then flattened.
        sampling_frequency_hz: Sampling rate of ``signal_data``.
        modulation_processing_type: ``FILTERBANK`` produces eight modulation
            channels at 800 Hz; any other value produces a single channel
            centred at 0 Hz at 100 Hz.

    Returns:
        A tuple ``(representation, rate)``: the real-valued 3-D array and the
        sampling rate of its sample axis (800.0 or 100.0).
    """
    if signal_data.ndim > 1:
        if signal_data.shape[0] < signal_data.shape[1]:
            signal_data = signal_data.T
        signal_data = signal_data.ravel()

    # Model input scaling (original note: 1.0 becomes 100 dB SPL).
    waveform = 10.0 * signal_data

    lowest_hz = 235.0
    highest_hz = min(0.5 * sampling_frequency_hz, 14500.0)

    # Resample to 1.5x the input rate when the rate is below three times the
    # top analysis frequency. fast_resample_poly is used here in place of
    # scipy.signal.resample_poly with the same arguments.
    if sampling_frequency_hz < 3.0 * highest_hz:
        raised_rate = int(round(1.5 * sampling_frequency_hz))
        waveform = fast_resample_poly(waveform, raised_rate, int(sampling_frequency_hz))
        sampling_frequency_hz = float(raised_rate)

    # 1. Gammatone analysis filterbank; only the real part is kept.
    filterbank = GammatoneAnalyzer(sampling_frequency_hz, lowest_hz, 1000.0, highest_hz, 1.0)
    subbands = np.real(filterbank.process(waveform))

    # 2 & 3. Fused hair-cell transduction and nerve adaptation.
    decibel_range = 100.0
    hearing_floor = 10.0 ** (-decibel_range / 20.0)
    loop_bandwidths = 1.0 / (np.pi * np.array([0.005, 0.05, 0.129, 0.253, 0.5]))
    haircell_gain = math.exp(-math.pi * 2000.0 / sampling_frequency_hz)

    if _HAS_NUMBA:
        adapted = _numba_fused_auditory_kernel(
            subbands, sampling_frequency_hz, haircell_gain, loop_bandwidths, hearing_floor
        )
    else:
        adapted = _fallback_fused_auditory_kernel(
            subbands, sampling_frequency_hz, haircell_gain, loop_bandwidths, hearing_floor
        )

    # Linear rescale: the last stage floor (five square roots of the
    # threshold) maps to 0 and an adapted value of 1.0 maps to 100.
    last_stage_floor = hearing_floor
    for _ in range(5):
        last_stage_floor = math.sqrt(last_stage_floor)
    adapted = (decibel_range / (1.0 - last_stage_floor)) * (adapted - last_stage_floor)

    # 4. Choose the modulation rate and the modulation filter parameters.
    if modulation_processing_type == ModulationProcessingType.FILTERBANK:
        target_rate = 800
        geometric_steps = (5.0 / 3.0) ** np.arange(6)
        center_hz = np.concatenate(([0.0, 5.0], 10.0 * geometric_steps))
        bandwidth_hz = np.concatenate(([5.0, 5.0], 5.0 * geometric_steps))
    else:
        target_rate = 100
        center_hz = np.array([0.0])
        bandwidth_hz = np.array([15.92])

    envelopes = fast_resample_poly(adapted, target_rate, int(sampling_frequency_hz), axis=-1)
    sampling_frequency_hz = float(target_rate)

    band_count = adapted.shape[0]
    frame_count = envelopes.shape[1]
    internal_representation = np.zeros(
        (band_count, frame_count, len(center_hz)), dtype=complex
    )

    # One complex first-order filter per modulation channel, run by lfilter.
    for channel, (center, bandwidth) in enumerate(zip(center_hz, bandwidth_hz)):
        pole_radius = math.exp(-math.pi * bandwidth / sampling_frequency_hz)
        feedforward = np.array([1.0 - pole_radius])
        feedback = np.array([
            1.0,
            -pole_radius * np.exp(2j * np.pi * center / sampling_frequency_hz)
        ])
        internal_representation[:, :, channel] = signal.lfilter(
            feedforward, feedback, envelopes, axis=-1
        )

    # Channels centred strictly above 10 Hz keep the magnitude; the others
    # (including the one at exactly 10 Hz) keep the real part.
    magnitude_channels = center_hz > 10.0
    internal_representation[:, :, ~magnitude_channels] = np.real(
        internal_representation[:, :, ~magnitude_channels]
    )
    internal_representation[:, :, magnitude_channels] = np.abs(
        internal_representation[:, :, magnitude_channels]
    )

    # Every imaginary part is zero by now, so return a real-valued array.
    return np.real(internal_representation), sampling_frequency_hz
|
peass/config.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PEASS Configuration and Data Structures
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from enum import auto
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
# Dynamically enable slots only on Python 3.10+
|
|
14
|
+
# TODO: drop py3.9 support and just always include slots
|
|
15
|
+
_DATACLASS_KWARGS = {"slots": True} if sys.version_info >= (3, 10) else {}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ModulationProcessingType(Enum):
    """Selects how the auditory model processes modulation.

    Values are assigned by ``auto()`` in declaration order (1, 2).
    """

    # Default of generate_auditory_internal_representation.
    LOWPASS = auto()
    # Selects the multi-channel modulation filterbank branch.
    FILTERBANK = auto()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(**_DATACLASS_KWARGS)
class DecomposedWaveforms:
    """In-memory NumPy arrays of the four decomposed components.

    Declared with ``slots=True`` when running on Python 3.10 or newer.
    """

    # All four fields are required and positional, in this order.
    true_target: np.ndarray
    target_distortion: np.ndarray
    interference: np.ndarray
    artifacts: np.ndarray
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(**_DATACLASS_KWARGS)
class DecomposedFilePaths:
    """Paths of the WAV files written for the four decomposed components.

    Field names mirror those of ``DecomposedWaveforms``. Declared with
    ``slots=True`` when running on Python 3.10 or newer.
    """

    # All four fields are required and positional, in this order.
    true_target: str
    target_distortion: str
    interference: str
    artifacts: str
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(**_DATACLASS_KWARGS)
class DecompositionResult:
    """Result of a decomposition: the arrays plus, optionally, the file paths.

    Declared with ``slots=True`` when running on Python 3.10 or newer.
    """

    # Always present.
    waveforms: DecomposedWaveforms
    # Defaults to None when no paths are supplied.
    file_paths: Optional[DecomposedFilePaths] = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(**_DATACLASS_KWARGS)
class DecompositionConfiguration:
    """Settings for the subband least-squares windowing.

    Every field has a default, so ``DecompositionConfiguration()`` is valid.
    Declared with ``slots=True`` when running on Python 3.10 or newer.
    """

    # Output location; defaults to the current directory.
    destination_directory: str = "./"
    # Off by default.
    use_two_stage_projection: bool = False
    # Durations in seconds, per the field names.
    frame_length_seconds: float = 0.5
    filter_length_seconds: float = 0.04
    # Durations in milliseconds, per the field names.
    shade_in_milliseconds: float = 10.0
    shade_out_milliseconds: float = 10.0
    segmentation_factor: int = 1
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(**_DATACLASS_KWARGS)
class PerceptualSeparationScores:
    """Final metrics representing the predicted subjective evaluation.

    The eight float fields are required; the two decomposition fields are
    optional and default to None. Declared with ``slots=True`` when running
    on Python 3.10 or newer.
    """

    # Perceptual scores.
    overall_perceptual_score: float
    target_perceptual_score: float
    interference_perceptual_score: float
    artifact_perceptual_score: float

    # Energy ratios.
    source_to_distortion_ratio: float
    source_to_spatial_distortion_ratio: float
    source_to_interference_ratio: float
    source_to_artifacts_ratio: float

    # Optional decomposition outputs attached to the scores.
    decomposition_waveforms: Optional[DecomposedWaveforms] = None
    decomposition_files: Optional[DecomposedFilePaths] = None
|