nkululeko 0.94.2__py3-none-any.whl → 0.95.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/augmenting/resampler.py +25 -14
- nkululeko/autopredict/ap_emotion.py +36 -0
- nkululeko/autopredict/ap_text.py +45 -0
- nkululeko/autopredict/whisper_transcriber.py +81 -0
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +53 -3
- nkululeko/explore.py +32 -13
- nkululeko/feat_extract/feats_analyser.py +45 -17
- nkululeko/feat_extract/feats_emotion2vec.py +51 -26
- nkululeko/feat_extract/feinberg_praat.py +515 -372
- nkululeko/glob_conf.py +9 -0
- nkululeko/modelrunner.py +15 -6
- nkululeko/models/model_tuned.py +416 -84
- nkululeko/models/model_xgb.py +149 -3
- nkululeko/plots.py +25 -19
- nkululeko/predict.py +6 -5
- nkululeko/reporting/report.py +7 -5
- nkululeko/reporting/reporter.py +8 -5
- nkululeko/runmanager.py +1 -1
- nkululeko/utils/util.py +34 -2
- {nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/METADATA +1 -1
- {nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/RECORD +26 -23
- {nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/WHEEL +0 -0
- {nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,8 @@
|
|
1
1
|
"""This is a copy of David R. Feinberg's Praat scripts.
|
2
2
|
https://github.com/drfeinberg/PraatScripts
|
3
3
|
taken June 23rd 2022.
|
4
|
+
|
5
|
+
2025-05-06: Optimized for faster computation (bta).
|
4
6
|
"""
|
5
7
|
|
6
8
|
#!/usr/bin/env python3
|
@@ -13,164 +15,340 @@ import pandas as pd
|
|
13
15
|
import parselmouth
|
14
16
|
from parselmouth.praat import call
|
15
17
|
from scipy.stats.mstats import zscore
|
18
|
+
from scipy.stats import lognorm
|
19
|
+
from scipy import stats
|
16
20
|
from sklearn.decomposition import PCA
|
17
21
|
from tqdm import tqdm
|
18
22
|
|
19
|
-
# This is the function to measure source acoustics using default male parameters.
|
20
|
-
|
21
|
-
|
22
|
-
def measure_pitch(voice_id, f0min, f0max, unit):
|
23
|
-
sound = parselmouth.Sound(voice_id) # read the sound
|
24
|
-
duration = call(sound, "Get total duration") # duration
|
25
|
-
pitch = call(sound, "To Pitch", 0.0, f0min, f0max) # create a praat pitch object
|
26
|
-
mean_f0 = call(pitch, "Get mean", 0, 0, unit) # get mean pitch
|
27
|
-
stdev_f0 = call(
|
28
|
-
pitch, "Get standard deviation", 0, 0, unit
|
29
|
-
) # get standard deviation
|
30
|
-
harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
|
31
|
-
hnr = call(harmonicity, "Get mean", 0, 0)
|
32
|
-
point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
|
33
|
-
local_jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
|
34
|
-
localabsolute_jitter = call(
|
35
|
-
point_process, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3
|
36
|
-
)
|
37
|
-
rap_jitter = call(point_process, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
|
38
|
-
ppq5_jitter = call(point_process, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
|
39
|
-
ddp_jitter = call(point_process, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
|
40
|
-
local_shimmer = call(
|
41
|
-
[sound, point_process],
|
42
|
-
"Get shimmer (local)",
|
43
|
-
0,
|
44
|
-
0,
|
45
|
-
0.0001,
|
46
|
-
0.02,
|
47
|
-
1.3,
|
48
|
-
1.6,
|
49
|
-
)
|
50
|
-
localdb_shimmer = call(
|
51
|
-
[sound, point_process],
|
52
|
-
"Get shimmer (local_dB)",
|
53
|
-
0,
|
54
|
-
0,
|
55
|
-
0.0001,
|
56
|
-
0.02,
|
57
|
-
1.3,
|
58
|
-
1.6,
|
59
|
-
)
|
60
|
-
apq3_shimmer = call(
|
61
|
-
[sound, point_process],
|
62
|
-
"Get shimmer (apq3)",
|
63
|
-
0,
|
64
|
-
0,
|
65
|
-
0.0001,
|
66
|
-
0.02,
|
67
|
-
1.3,
|
68
|
-
1.6,
|
69
|
-
)
|
70
|
-
aqpq5_shimmer = call(
|
71
|
-
[sound, point_process],
|
72
|
-
"Get shimmer (apq5)",
|
73
|
-
0,
|
74
|
-
0,
|
75
|
-
0.0001,
|
76
|
-
0.02,
|
77
|
-
1.3,
|
78
|
-
1.6,
|
79
|
-
)
|
80
|
-
apq11_shimmer = call(
|
81
|
-
[sound, point_process],
|
82
|
-
"Get shimmer (apq11)",
|
83
|
-
0,
|
84
|
-
0,
|
85
|
-
0.0001,
|
86
|
-
0.02,
|
87
|
-
1.3,
|
88
|
-
1.6,
|
89
|
-
)
|
90
|
-
dda_shimmer = call(
|
91
|
-
[sound, point_process], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6
|
92
|
-
)
|
93
23
|
|
94
|
-
|
95
|
-
|
96
|
-
mean_f0,
|
97
|
-
stdev_f0,
|
98
|
-
hnr,
|
99
|
-
local_jitter,
|
100
|
-
localabsolute_jitter,
|
101
|
-
rap_jitter,
|
102
|
-
ppq5_jitter,
|
103
|
-
ddp_jitter,
|
104
|
-
local_shimmer,
|
105
|
-
localdb_shimmer,
|
106
|
-
apq3_shimmer,
|
107
|
-
aqpq5_shimmer,
|
108
|
-
apq11_shimmer,
|
109
|
-
dda_shimmer,
|
110
|
-
)
|
24
|
+
class AudioFeatureExtractor:
    """Extract Praat-based acoustic features from one ``parselmouth.Sound``.

    The pitch and point-process objects are created once per sound in
    :meth:`extract_all_features` and shared by the pitch/jitter/shimmer and
    formant measurements.  The "sounding" intervals found during the pause
    analysis are likewise computed once and reused for the syllable rates.

    Args:
        f0min: Pitch floor handed to Praat's "To Pitch", "To Harmonicity (cc)"
            and "To PointProcess (periodic, cc)" commands.
        f0max: Pitch ceiling handed to "To Pitch" and
            "To PointProcess (periodic, cc)".
    """

    # (output column, Praat query) pairs; their order fixes the column order.
    _JITTER_QUERIES = (
        ("localJitter", "Get jitter (local)"),
        ("localabsoluteJitter", "Get jitter (local, absolute)"),
        ("rapJitter", "Get jitter (rap)"),
        ("ppq5Jitter", "Get jitter (ppq5)"),
        ("ddpJitter", "Get jitter (ddp)"),
    )
    _SHIMMER_QUERIES = (
        ("localShimmer", "Get shimmer (local)"),
        ("localdbShimmer", "Get shimmer (local_dB)"),
        ("apq3Shimmer", "Get shimmer (apq3)"),
        ("apq5Shimmer", "Get shimmer (apq5)"),
        ("apq11Shimmer", "Get shimmer (apq11)"),
        ("ddaShimmer", "Get shimmer (dda)"),
    )
    # Argument tails shared by every jitter / shimmer query.
    _JITTER_ARGS = (0, 0, 0.0001, 0.02, 1.3)
    _SHIMMER_ARGS = (0, 0, 0.0001, 0.02, 1.3, 1.6)
    _FORMANT_NUMBERS = (1, 2, 3, 4)

    def __init__(self, f0min=75, f0max=300):
        self.f0min = f0min
        self.f0max = f0max

    def extract_all_features(self, sound):
        """Return a dict with every acoustic feature of ``sound``.

        Args:
            sound: The ``parselmouth.Sound`` to analyse.

        Returns:
            A dict whose keys are, in order: ``"duration"``, the pitch /
            HNR / jitter / shimmer features, the formant means and medians,
            and the pause and speech-rate features.
        """
        duration = sound.get_total_duration()
        # Built once here and shared by the pitch and formant extraction.
        pitch = call(sound, "To Pitch", 0.0, self.f0min, self.f0max)
        point_process = call(
            sound, "To PointProcess (periodic, cc)", self.f0min, self.f0max
        )

        return {
            "duration": duration,
            **self._extract_pitch_features(sound, pitch, point_process),
            **self._extract_formant_features(sound, point_process),
            **self._extract_speech_features(sound),
        }

    def _extract_pitch_features(self, sound, pitch, point_process):
        """Return mean/stdev F0, HNR, and the jitter and shimmer measures.

        Args:
            sound: The analysed ``parselmouth.Sound``.
            pitch: Praat Pitch object derived from ``sound``.
            point_process: Praat PointProcess derived from ``sound``.

        Returns:
            Dict with ``meanF0Hz``, ``stdevF0Hz``, ``HNR`` followed by the
            five jitter and six shimmer columns.
        """
        features = {
            "meanF0Hz": call(pitch, "Get mean", 0, 0, "Hertz"),
            "stdevF0Hz": call(pitch, "Get standard deviation", 0, 0, "Hertz"),
        }

        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, self.f0min, 0.1, 1.0)
        features["HNR"] = call(harmonicity, "Get mean", 0, 0)

        # Jitter is queried on the point process alone ...
        for column, query in self._JITTER_QUERIES:
            features[column] = call(point_process, query, *self._JITTER_ARGS)

        # ... shimmer needs the sound together with the point process.
        for column, query in self._SHIMMER_QUERIES:
            features[column] = call(
                [sound, point_process], query, *self._SHIMMER_ARGS
            )

        return features

    def _extract_formant_features(self, sound, point_process):
        """Return mean and median of F1-F4 sampled at the glottal pulses.

        Formant values are read at every time point of ``point_process``;
        undefined (NaN) readings are dropped before the statistics are
        taken.  A formant without any defined reading yields ``np.nan``.

        Args:
            sound: The analysed ``parselmouth.Sound``.
            point_process: Praat PointProcess supplying the sample times.

        Returns:
            Dict with ``f1_mean`` .. ``f4_mean`` followed by
            ``f1_median`` .. ``f4_median``.
        """
        formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
        num_points = call(point_process, "Get number of points")

        # One growing list of defined readings per formant number.
        tracks = {number: [] for number in self._FORMANT_NUMBERS}
        for index in range(1, num_points + 1):  # Praat indices are 1-based
            t = call(point_process, "Get time from index", index)
            for number, samples in tracks.items():
                value = call(
                    formants, "Get value at time", number, t, "Hertz", "Linear"
                )
                if not math.isnan(value):
                    samples.append(value)

        features = {}
        for stat_name, stat in (
            ("mean", statistics.mean),
            ("median", statistics.median),
        ):
            for number, samples in tracks.items():
                features[f"f{number}_{stat_name}"] = (
                    stat(samples) if samples else np.nan
                )
        return features

    @staticmethod
    def _sounding_intervals(textgrid):
        """Return ``(begin, end)`` of every "sounding" interval in tier 1."""
        silencetier = call(textgrid, "Extract tier", 1)
        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
        nrows = call(silencetable, "Get number of rows")
        return [
            (
                call(silencetable, "Get value", row, 1),
                call(silencetable, "Get value", row, 2),
            )
            for row in range(1, nrows + 1)
        ]

    def _extract_speech_features(self, sound):
        """Return pause-distribution, syllable-rate and pause-proportion features.

        Args:
            sound: The analysed ``parselmouth.Sound``.

        Returns:
            Dict with the six ``pause_*`` features, the six syllable / rate
            features and ``proportion_pause_duration`` (total pause time
            divided by total speaking time, 0 when nothing is sounding).
        """
        silencedb = -25
        mindip = 2
        minpause = 0.3
        originaldur = sound.get_total_duration()

        # A single intensity contour serves thresholds, silences and peaks.
        intensity = sound.to_intensity(50)
        max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)
        min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
        max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

        # Peak threshold: silencedb below the 99% quantile, but never below
        # the quietest frame.
        threshold = max_99_intensity + silencedb
        if threshold < min_intensity:
            threshold = min_intensity
        # Silence threshold for the TextGrid, relative to the true maximum.
        threshold3 = silencedb - (max_intensity - max_99_intensity)

        textgrid = call(
            intensity,
            "To TextGrid (silences)",
            threshold3,
            minpause,
            0.1,
            "silent",
            "sounding",
        )

        intervals = self._sounding_intervals(textgrid)
        speakingtot = sum(end - begin for begin, end in intervals)

        # A pause is the gap between two consecutive sounding intervals.
        pause_durations = []
        for (_, prev_end), (next_begin, _) in zip(intervals, intervals[1:]):
            gap = next_begin - prev_end
            if gap > 0:
                pause_durations.append(gap)

        pause_features = self._calculate_pause_distribution(pause_durations)
        syllable_features = self._count_syllables_optimized(
            sound,
            intensity,
            textgrid,
            threshold,
            mindip,
            originaldur,
            sounding_intervals=intervals,
        )

        pausetot = originaldur - speakingtot
        proportion_pause_duration = pausetot / speakingtot if speakingtot > 0 else 0

        return {
            **pause_features,
            **syllable_features,
            "proportion_pause_duration": proportion_pause_duration,
        }

    def _calculate_pause_distribution(self, pause_durations):
        """Summarise pause durations and fit a lognormal distribution to them.

        Args:
            pause_durations: Sequence of positive pause lengths.

        Returns:
            Dict with ``pause_lognorm_mu``, ``pause_lognorm_sigma``,
            ``pause_lognorm_ks_pvalue``, ``pause_mean_duration``,
            ``pause_std_duration`` and ``pause_cv``.  All values are
            ``np.nan`` when fewer than three pauses are given; the lognormal
            entries stay ``np.nan`` when the fit fails.
        """
        features = {
            "pause_lognorm_mu": np.nan,
            "pause_lognorm_sigma": np.nan,
            "pause_lognorm_ks_pvalue": np.nan,
            "pause_mean_duration": np.nan,
            "pause_std_duration": np.nan,
            "pause_cv": np.nan,
        }
        if len(pause_durations) < 3:
            # Too few samples for a meaningful distribution fit.
            return features

        try:
            durations = np.array(pause_durations)
            mean_duration = np.mean(durations)
            std_duration = np.std(durations)
            features["pause_mean_duration"] = mean_duration
            features["pause_std_duration"] = std_duration
            features["pause_cv"] = (
                std_duration / mean_duration if mean_duration > 0 else 0
            )

            # With loc fixed at 0, shape is sigma and log(scale) is mu.
            shape, loc, scale = lognorm.fit(durations, floc=0)
            features["pause_lognorm_sigma"] = shape
            features["pause_lognorm_mu"] = np.log(scale)

            # NOTE: the parameters were estimated from the same sample, so
            # this p-value is optimistic; it is a descriptive feature only.
            _, features["pause_lognorm_ks_pvalue"] = stats.kstest(
                durations,
                lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
            )
        except (ValueError, RuntimeError) as e:
            print(f"Error fitting lognormal distribution: {e}")

        return features

    def _count_syllables_optimized(
        self,
        sound,
        intensity,
        textgrid,
        threshold,
        mindip,
        originaldur,
        sounding_intervals=None,
    ):
        """Count voiced syllable nuclei and derive the speech-rate features.

        Intensity peaks above ``threshold`` are candidate nuclei.  A peak is
        kept when the intensity dips by more than ``mindip`` before the next
        candidate, and it counts as a syllable when it is voiced and lies in
        a "sounding" interval of ``textgrid``.

        Args:
            sound: The analysed ``parselmouth.Sound``.
            intensity: Praat Intensity object of ``sound``.
            textgrid: Silence/sounding TextGrid derived from ``intensity``.
            threshold: Minimum intensity for a peak to be considered.
            mindip: Minimum dip between two peaks for the first to be kept.
            originaldur: Total duration of ``sound``.
            sounding_intervals: Optional precomputed ``(begin, end)`` pairs of
                the sounding intervals; derived from ``textgrid`` when omitted.

        Returns:
            Dict with ``nsyll``, ``npause``, ``phonationtime_s``,
            ``speechrate_nsyll_dur``,
            ``articulation_rate_nsyll_phonationtime`` and
            ``ASD_speakingtime_nsyll``.
        """
        intensity_matrix = call(intensity, "Down to Matrix")
        sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
        intensity_duration = call(sound_from_intensity_matrix, "Get total duration")

        point_process = call(
            sound_from_intensity_matrix,
            "To PointProcess (extrema)",
            "Left",
            "yes",
            "no",
            "Sinc70",
        )
        numpeaks = call(point_process, "Get number of points")

        # Collect (time, intensity) of every peak above the threshold.
        peaks = []
        for index in range(1, numpeaks + 1):
            t = call(point_process, "Get time from index", index)
            value = call(sound_from_intensity_matrix, "Get value at time", t, "Cubic")
            if value > threshold:
                peaks.append((t, value))

        # Keep a peak only if the contour dips enough before the next one;
        # the final candidate has no successor and is therefore never kept.
        validtime = []
        for (peak_time, peak_value), (next_time, _) in zip(peaks, peaks[1:]):
            dip = call(intensity, "Get minimum", peak_time, next_time, "None")
            if abs(peak_value - dip) > mindip:
                validtime.append(peak_time)

        pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        voicedcount = 0
        for querytime in validtime:
            whichinterval = call(textgrid, "Get interval at time", 1, querytime)
            whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
            pitch_value = pitch.get_value_at_time(querytime)
            if not math.isnan(pitch_value) and whichlabel == "sounding":
                voicedcount += 1

        # Reuse the caller's intervals instead of rebuilding the Praat table.
        if sounding_intervals is None:
            sounding_intervals = self._sounding_intervals(textgrid)
        npauses = len(sounding_intervals)
        speakingtot = sum(end - begin for begin, end in sounding_intervals)

        # Every rate falls back to 0 when its denominator is empty.
        speakingrate = voicedcount / originaldur if originaldur > 0 else 0
        articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
        asd = speakingtot / voicedcount if voicedcount > 0 else 0

        return {
            "nsyll": voicedcount,
            "npause": npauses - 1,
            "phonationtime_s": intensity_duration,
            "speechrate_nsyll_dur": speakingrate,
            "articulation_rate_nsyll_phonationtime": articulationrate,
            "ASD_speakingtime_nsyll": asd,
        }
|
174
352
|
|
175
353
|
|
176
354
|
# ## This function runs a 2-factor Principal Components Analysis (PCA) on Jitter and Shimmer
|
@@ -227,231 +405,126 @@ def run_pca(df):
|
|
227
405
|
|
228
406
|
|
229
407
|
def compute_features(file_index):
    """Extract the Praat feature table for every segment in ``file_index``.

    Each ``(file, start, end)`` entry is read with ``audiofile``, wrapped in
    a ``parselmouth.Sound`` and passed to ``AudioFeatureExtractor``; the
    per-segment dicts are stacked into a DataFrame and extended by
    ``add_derived_features``.

    Columns produced per segment by the extractor (36):
        - Basic: duration, meanF0Hz, stdevF0Hz, HNR (4)
        - Jitter: localJitter, localabsoluteJitter, rapJitter, ppq5Jitter,
          ddpJitter (5)
        - Shimmer: localShimmer, localdbShimmer, apq3Shimmer, apq5Shimmer,
          apq11Shimmer, ddaShimmer (6)
        - Formants: f1-f4 mean/median (8)
        - Pause distribution: pause_lognorm_mu, pause_lognorm_sigma,
          pause_lognorm_ks_pvalue, pause_mean_duration, pause_std_duration,
          pause_cv (6)
        - Speech rate: nsyll, npause, phonationtime_s, speechrate_nsyll_dur,
          articulation_rate_nsyll_phonationtime, ASD_speakingtime_nsyll (6)
        - proportion_pause_duration (1)

    Columns added afterwards:
        - PCA scores from ``run_pca`` (presumably JitterPCA and ShimmerPCA;
          confirm against ``run_pca``)
        - VTL: pF, fdisp, avgFormant, mff, fitch_vtl, delta_f, vtl_delta_f (7)

    Args:
        file_index: Index whose ``to_list()`` yields ``(file, start, end)``
            tuples; ``start`` and ``end`` must provide ``total_seconds()``.

    Returns:
        DataFrame with one row per entry of ``file_index``, in input order.
        A segment that could not be processed is reported on stdout and
        kept as a row of NaN so that rows stay aligned with ``file_index``.
    """
    # Columns guaranteed to exist even if every single segment fails.
    fallback_columns = (
        ["duration", "meanF0Hz", "stdevF0Hz", "HNR"]
        + [f"f{i}_{stat}" for i in range(1, 5) for stat in ["mean", "median"]]
        + [
            "localJitter",
            "localabsoluteJitter",
            "rapJitter",
            "ppq5Jitter",
            "ddpJitter",
            "localShimmer",
            "localdbShimmer",
            "apq3Shimmer",
            "apq5Shimmer",
            "apq11Shimmer",
            "ddaShimmer",
        ]
    )

    extractor = AudioFeatureExtractor()
    feature_list = []

    for wave_file, start, end in tqdm(file_index.to_list()):
        try:
            signal, sampling_rate = audiofile.read(
                wave_file,
                offset=start.total_seconds(),
                duration=(end - start).total_seconds(),
                always_2d=True,
            )
            sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
            feature_list.append(extractor.extract_all_features(sound))
        except Exception as errors:
            # Best effort per segment: report it and keep an all-NaN row.
            print(f"error on file {wave_file}: {errors}")
            # An empty dict (rather than a partial key list) keeps the column
            # order defined by the successful rows, wherever the failure is.
            feature_list.append({})

    df = pd.DataFrame(feature_list)
    for column in fallback_columns:
        if column not in df.columns:
            df[column] = np.nan

    df = add_derived_features(df)

    print(
        f"Feature extraction completed. Total features extracted: {len(df.columns)}"
    )
    return df
|
448
484
|
|
449
|
-
print("Now extracting speech rate parameters...")
|
450
485
|
|
451
|
-
|
452
|
-
|
486
|
+
def add_derived_features(df):
    """Return ``df`` extended with PCA scores and formant-derived measures.

    The result of ``run_pca(df)`` is appended column-wise, then seven
    columns computed from the ``f1_median`` .. ``f4_median`` columns are
    added: ``pF``, ``fdisp``, ``avgFormant``, ``mff``, ``fitch_vtl``,
    ``delta_f`` and ``vtl_delta_f``.

    Args:
        df: Feature table containing the four formant-median columns and
            whatever columns ``run_pca`` requires.

    Returns:
        A new DataFrame holding the original columns plus the derived ones.
    """
    df = pd.concat([df, run_pca(df)], axis=1)

    first = df["f1_median"]
    second = df["f2_median"]
    third = df["f3_median"]
    fourth = df["f4_median"]

    # Formant position, dispersion, arithmetic and geometric mean formant.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_total = zscore(first) + zscore(second) + zscore(third) + zscore(fourth)
        df["pF"] = z_total / 4
        df["fdisp"] = (fourth - first) / 3
        df["avgFormant"] = (first + second + third + fourth) / 4
        df["mff"] = (first * second * third * fourth) ** 0.25

    # Fitch vocal-tract length: odd-multiple weighted estimates, averaged.
    fitch_terms = (
        (1 * (35000 / (4 * first)))
        + (3 * (35000 / (4 * second)))
        + (5 * (35000 / (4 * third)))
        + (7 * (35000 / (4 * fourth)))
    )
    df["fitch_vtl"] = fitch_terms / 4

    # Delta-F: weighted formant sum divided by the sum of squared weights.
    weighted_sum = 0.5 * first + 1.5 * second + 2.5 * third + 3.5 * fourth
    squared_weights = 0.5**2 + 1.5**2 + 2.5**2 + 3.5**2
    df["delta_f"] = weighted_sum / squared_weights
    df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])

    return df
|
455
528
|
|
456
529
|
|
457
530
|
"""
|
@@ -469,6 +542,12 @@ def get_speech_rate(file_index):
|
|
469
542
|
"speechrate_nsyll_dur",
|
470
543
|
"articulation_rate_nsyll_phonationtime",
|
471
544
|
"ASD_speakingtime_nsyll",
|
545
|
+
"pause_lognorm_mu",
|
546
|
+
"pause_lognorm_sigma",
|
547
|
+
"pause_lognorm_ks_pvalue",
|
548
|
+
"pause_mean_duration",
|
549
|
+
"pause_std_duration",
|
550
|
+
"pause_cv",
|
472
551
|
]
|
473
552
|
datalist = []
|
474
553
|
for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
|
@@ -527,6 +606,8 @@ def speech_rate(sound):
|
|
527
606
|
silencetable = call(silencetier, "Down to TableOfReal", "sounding")
|
528
607
|
npauses = call(silencetable, "Get number of rows")
|
529
608
|
speakingtot = 0
|
609
|
+
pause_durations = [] # Store individual pause durations
|
610
|
+
|
530
611
|
for ipause in range(npauses):
|
531
612
|
pause = ipause + 1
|
532
613
|
beginsound = call(silencetable, "Get value", pause, 1)
|
@@ -534,6 +615,53 @@ def speech_rate(sound):
|
|
534
615
|
speakingdur = endsound - beginsound
|
535
616
|
speakingtot += speakingdur
|
536
617
|
|
618
|
+
# Calculate pause duration (time between speaking segments)
|
619
|
+
if ipause > 0:
|
620
|
+
prev_pause = ipause
|
621
|
+
prev_endsound = call(silencetable, "Get value", prev_pause, 2)
|
622
|
+
pause_duration = beginsound - prev_endsound
|
623
|
+
if pause_duration > 0: # Only include positive pause durations
|
624
|
+
pause_durations.append(pause_duration)
|
625
|
+
|
626
|
+
# Calculate pause duration distribution parameters
|
627
|
+
pause_lognorm_mu = np.nan
|
628
|
+
pause_lognorm_sigma = np.nan
|
629
|
+
pause_lognorm_ks_pvalue = np.nan
|
630
|
+
pause_mean_duration = np.nan
|
631
|
+
pause_std_duration = np.nan
|
632
|
+
pause_cv = np.nan
|
633
|
+
|
634
|
+
if len(pause_durations) >= 3: # Need minimum samples for distribution fitting
|
635
|
+
try:
|
636
|
+
# Fit lognormal distribution to pause durations
|
637
|
+
pause_durations_array = np.array(pause_durations)
|
638
|
+
|
639
|
+
# Calculate basic statistics
|
640
|
+
pause_mean_duration = np.mean(pause_durations_array)
|
641
|
+
pause_std_duration = np.std(pause_durations_array)
|
642
|
+
pause_cv = (
|
643
|
+
pause_std_duration / pause_mean_duration
|
644
|
+
if pause_mean_duration > 0
|
645
|
+
else 0
|
646
|
+
)
|
647
|
+
|
648
|
+
# Fit lognormal distribution
|
649
|
+
shape, loc, scale = lognorm.fit(pause_durations_array, floc=0)
|
650
|
+
pause_lognorm_sigma = shape # shape parameter (sigma)
|
651
|
+
pause_lognorm_mu = np.log(scale) # location parameter (mu)
|
652
|
+
|
653
|
+
# Test goodness of fit using Kolmogorov-Smirnov test
|
654
|
+
ks_stat, pause_lognorm_ks_pvalue = stats.kstest(
|
655
|
+
pause_durations_array,
|
656
|
+
lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
|
657
|
+
)
|
658
|
+
|
659
|
+
except (ValueError, RuntimeError) as e:
|
660
|
+
print(f"Error fitting lognormal distribution to pause durations: {e}")
|
661
|
+
|
662
|
+
# Calculate pause duration
|
663
|
+
pausetot = originaldur - speakingtot
|
664
|
+
|
537
665
|
intensity_matrix = call(intensity, "Down to Matrix")
|
538
666
|
# sndintid = sound_from_intensity_matrix
|
539
667
|
sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
|
@@ -567,8 +695,8 @@ def speech_rate(sound):
|
|
567
695
|
# fill array with valid peaks: only intensity values if preceding
|
568
696
|
# dip in intensity is greater than mindip
|
569
697
|
validpeakcount = 0
|
570
|
-
currenttime = timepeaks[0]
|
571
|
-
currentint = intensities[0]
|
698
|
+
currenttime = timepeaks[0] if timepeaks else 0
|
699
|
+
currentint = intensities[0] if intensities else 0
|
572
700
|
validtime = []
|
573
701
|
|
574
702
|
for p in range(peakcount - 1):
|
@@ -609,20 +737,35 @@ def speech_rate(sound):
|
|
609
737
|
|
610
738
|
# return results
|
611
739
|
speakingrate = voicedcount / originaldur
|
612
|
-
articulationrate = voicedcount / speakingtot
|
740
|
+
articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
|
613
741
|
npause = npauses - 1
|
614
742
|
try:
|
615
743
|
asd = speakingtot / voicedcount
|
616
744
|
except ZeroDivisionError:
|
617
745
|
asd = 0
|
618
746
|
print("caught zero division")
|
747
|
+
|
748
|
+
# Calculate proportion pause duration
|
749
|
+
try:
|
750
|
+
proportion_pause_duration = pausetot / speakingtot
|
751
|
+
except ZeroDivisionError:
|
752
|
+
proportion_pause_duration = 0
|
753
|
+
print("caught zero division for proportion pause duration")
|
754
|
+
|
619
755
|
speechrate_dictionary = {
|
620
756
|
"nsyll": voicedcount,
|
621
757
|
"npause": npause,
|
622
|
-
"dur_s": originaldur,
|
758
|
+
# "dur_s": originaldur,
|
623
759
|
"phonationtime_s": intensity_duration,
|
624
760
|
"speechrate_nsyll_dur": speakingrate,
|
625
761
|
"articulation_rate_nsyll_phonationtime": articulationrate,
|
626
762
|
"ASD_speakingtime_nsyll": asd,
|
763
|
+
"proportion_pause_duration": proportion_pause_duration,
|
764
|
+
"pause_lognorm_mu": pause_lognorm_mu,
|
765
|
+
"pause_lognorm_sigma": pause_lognorm_sigma,
|
766
|
+
"pause_lognorm_ks_pvalue": pause_lognorm_ks_pvalue,
|
767
|
+
"pause_mean_duration": pause_mean_duration,
|
768
|
+
"pause_std_duration": pause_std_duration,
|
769
|
+
"pause_cv": pause_cv,
|
627
770
|
}
|
628
771
|
return speechrate_dictionary
|