nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/augmenting/resampler.py +5 -2
- nkululeko/autopredict/ap_emotion.py +36 -0
- nkululeko/autopredict/ap_text.py +45 -0
- nkululeko/autopredict/tests/__init__.py +0 -0
- nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
- nkululeko/autopredict/whisper_transcriber.py +81 -0
- nkululeko/balance.py +222 -0
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +53 -3
- nkululeko/explore.py +32 -13
- nkululeko/feat_extract/feats_analyser.py +45 -17
- nkululeko/feat_extract/feats_emotion2vec.py +51 -26
- nkululeko/feat_extract/feats_praat.py +3 -3
- nkululeko/feat_extract/feats_praat_core.py +769 -0
- nkululeko/feat_extract/tests/__init__.py +1 -0
- nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
- nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
- nkululeko/glob_conf.py +9 -0
- nkululeko/modelrunner.py +15 -39
- nkululeko/models/model.py +4 -42
- nkululeko/models/model_tuned.py +416 -84
- nkululeko/models/model_xgb.py +148 -2
- nkululeko/models/tests/test_model_knn.py +49 -0
- nkululeko/models/tests/test_model_mlp.py +153 -0
- nkululeko/models/tests/test_model_xgb.py +33 -0
- nkululeko/nkululeko.py +0 -9
- nkululeko/plots.py +25 -19
- nkululeko/predict.py +8 -6
- nkululeko/reporting/report.py +7 -5
- nkululeko/reporting/reporter.py +20 -5
- nkululeko/test_predictor.py +7 -1
- nkululeko/tests/__init__.py +1 -0
- nkululeko/tests/test_balancing.py +270 -0
- nkululeko/utils/util.py +38 -6
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
- nkululeko/feat_extract/feats_opensmile copy.py +0 -93
- nkululeko/feat_extract/feinberg_praat.py +0 -628
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,769 @@
|
|
1
|
+
"""This is a copy of David R. Feinberg's Praat scripts.
|
2
|
+
https://github.com/drfeinberg/PraatScripts
|
3
|
+
taken June 23rd 2022.
|
4
|
+
|
5
|
+
2025-05-06: Optimized for faster computation (bta).
|
6
|
+
"""
|
7
|
+
|
8
|
+
#!/usr/bin/env python3
|
9
|
+
import math
|
10
|
+
import statistics
|
11
|
+
|
12
|
+
import audiofile
|
13
|
+
import numpy as np
|
14
|
+
import pandas as pd
|
15
|
+
import parselmouth
|
16
|
+
from parselmouth.praat import call
|
17
|
+
from scipy.stats.mstats import zscore
|
18
|
+
from scipy.stats import lognorm
|
19
|
+
from scipy import stats
|
20
|
+
from sklearn.decomposition import PCA
|
21
|
+
from tqdm import tqdm
|
22
|
+
|
23
|
+
|
24
|
+
class AudioFeatureExtractor:
    """Compute Praat-based acoustic features for a single sound object.

    Intermediate Praat objects (pitch, point process, intensity, silence
    TextGrid) are created once per sound and handed to the private helpers
    so that they are not rebuilt for every feature group.

    Args:
        f0min: Lower pitch bound handed to the Praat pitch, point-process
            and harmonicity commands.
        f0max: Upper pitch bound handed to the Praat pitch and
            point-process commands.
    """

    def __init__(self, f0min=75, f0max=300):
        self.f0min = f0min
        self.f0max = f0max

    def extract_all_features(self, sound):
        """Extract all acoustic features from a single sound object.

        Args:
            sound: Object accepted by ``parselmouth.praat.call`` that also
                offers ``get_total_duration``, ``to_intensity`` and
                ``to_pitch_ac`` (presumably a ``parselmouth.Sound``).

        Returns:
            dict: ``"duration"`` followed by the pitch, formant and speech
            feature dictionaries, merged in that order.
        """
        duration = sound.get_total_duration()
        # Created once here and shared by the pitch and formant helpers.
        pitch = call(sound, "To Pitch", 0.0, self.f0min, self.f0max)
        point_process = call(
            sound, "To PointProcess (periodic, cc)", self.f0min, self.f0max
        )

        pitch_features = self._extract_pitch_features(sound, pitch, point_process)
        formant_features = self._extract_formant_features(sound, point_process)
        speech_features = self._extract_speech_features(sound)

        return {
            "duration": duration,
            **pitch_features,
            **formant_features,
            **speech_features,
        }

    def _extract_pitch_features(self, sound, pitch, point_process):
        """Extract pitch, jitter, shimmer, and HNR features.

        Returns:
            dict: ``meanF0Hz``, ``stdevF0Hz``, ``HNR``, five ``*Jitter`` and
            six ``*Shimmer`` entries, in that order.
        """
        features = {
            "meanF0Hz": call(pitch, "Get mean", 0, 0, "Hertz"),
            "stdevF0Hz": call(pitch, "Get standard deviation", 0, 0, "Hertz"),
        }

        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, self.f0min, 0.1, 1.0)
        features["HNR"] = call(harmonicity, "Get mean", 0, 0)

        # All jitter queries share one argument list.
        jitter_params = (0, 0, 0.0001, 0.02, 1.3)
        jitter_commands = (
            ("localJitter", "Get jitter (local)"),
            ("localabsoluteJitter", "Get jitter (local, absolute)"),
            ("rapJitter", "Get jitter (rap)"),
            ("ppq5Jitter", "Get jitter (ppq5)"),
            ("ddpJitter", "Get jitter (ddp)"),
        )
        for key, command in jitter_commands:
            features[key] = call(point_process, command, *jitter_params)

        # Shimmer queries take the sound together with the point process.
        shimmer_params = (0, 0, 0.0001, 0.02, 1.3, 1.6)
        shimmer_commands = (
            ("localShimmer", "Get shimmer (local)"),
            ("localdbShimmer", "Get shimmer (local_dB)"),
            ("apq3Shimmer", "Get shimmer (apq3)"),
            ("apq5Shimmer", "Get shimmer (apq5)"),
            ("apq11Shimmer", "Get shimmer (apq11)"),
            ("ddaShimmer", "Get shimmer (dda)"),
        )
        for key, command in shimmer_commands:
            features[key] = call([sound, point_process], command, *shimmer_params)

        return features

    def _extract_formant_features(self, sound, point_process):
        """Extract mean and median of formants 1-4 sampled at the pulse times.

        Formant values are read at every point of ``point_process``; NaN
        readings are skipped. A formant without any valid reading yields
        ``np.nan`` for both statistics.

        Returns:
            dict: ``f1_mean`` .. ``f4_mean`` followed by ``f1_median`` ..
            ``f4_median``.
        """
        formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
        num_points = call(point_process, "Get number of points")

        formant_numbers = (1, 2, 3, 4)
        values = {number: [] for number in formant_numbers}

        # One pass over the pulses collects all four formants.
        for point in range(1, num_points + 1):
            t = call(point_process, "Get time from index", point)
            for number in formant_numbers:
                value = call(
                    formants, "Get value at time", number, t, "Hertz", "Linear"
                )
                if not math.isnan(value):
                    values[number].append(value)

        features = {}
        for suffix, statistic in (
            ("mean", statistics.mean),
            ("median", statistics.median),
        ):
            for number in formant_numbers:
                collected = values[number]
                features[f"f{number}_{suffix}"] = (
                    statistic(collected) if collected else np.nan
                )
        return features

    @staticmethod
    def _sounding_segments(textgrid):
        """Return ``(begin, end)`` for every "sounding" interval of tier 1."""
        silencetier = call(textgrid, "Extract tier", 1)
        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
        nrows = call(silencetable, "Get number of rows")
        return [
            (
                call(silencetable, "Get value", row, 1),
                call(silencetable, "Get value", row, 2),
            )
            for row in range(1, nrows + 1)
        ]

    def _extract_speech_features(self, sound):
        """Extract speech rate and pause features with lognormal distribution analysis.

        Returns:
            dict: the pause-distribution features, the syllable/rate
            features and ``proportion_pause_duration`` (total pause time
            divided by total speaking time, 0 when nothing is sounding).
        """
        silencedb = -25
        mindip = 2
        minpause = 0.3
        originaldur = sound.get_total_duration()

        # One intensity object serves all threshold and syllable queries.
        intensity = sound.to_intensity(50)
        max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)
        min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
        max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

        # ``threshold`` gates syllable peaks, ``threshold3`` gates silences.
        threshold = max_99_intensity + silencedb
        threshold2 = max_intensity - max_99_intensity
        threshold3 = silencedb - threshold2
        if threshold < min_intensity:
            threshold = min_intensity

        textgrid = call(
            intensity,
            "To TextGrid (silences)",
            threshold3,
            minpause,
            0.1,
            "silent",
            "sounding",
        )
        segments = self._sounding_segments(textgrid)

        speakingtot = 0
        pause_durations = []
        previous_end = None
        for beginsound, endsound in segments:
            speakingtot += endsound - beginsound
            # A pause is the gap between two consecutive sounding segments.
            if previous_end is not None:
                pause_duration = beginsound - previous_end
                if pause_duration > 0:
                    pause_durations.append(pause_duration)
            previous_end = endsound

        pause_features = self._calculate_pause_distribution(pause_durations)

        # Hand over the totals so the silence table is not rebuilt.
        syllable_features = self._count_syllables_optimized(
            sound,
            intensity,
            textgrid,
            threshold,
            mindip,
            originaldur,
            speakingtot=speakingtot,
            npauses=len(segments),
        )

        pausetot = originaldur - speakingtot
        proportion_pause_duration = pausetot / speakingtot if speakingtot > 0 else 0

        return {
            **pause_features,
            **syllable_features,
            "proportion_pause_duration": proportion_pause_duration,
        }

    def _calculate_pause_distribution(self, pause_durations):
        """Calculate lognormal distribution parameters for pause durations.

        Args:
            pause_durations: Sequence of pause lengths.

        Returns:
            dict: ``pause_lognorm_mu``, ``pause_lognorm_sigma``,
            ``pause_lognorm_ks_pvalue``, ``pause_mean_duration``,
            ``pause_std_duration`` and ``pause_cv``. All values are
            ``np.nan`` when fewer than three pauses are given; values not
            yet computed stay ``np.nan`` when fitting fails.
        """
        result = dict.fromkeys(
            (
                "pause_lognorm_mu",
                "pause_lognorm_sigma",
                "pause_lognorm_ks_pvalue",
                "pause_mean_duration",
                "pause_std_duration",
                "pause_cv",
            ),
            np.nan,
        )
        if len(pause_durations) < 3:
            return result

        try:
            durations = np.array(pause_durations)
            mean_duration = np.mean(durations)
            std_duration = np.std(durations)
            result["pause_mean_duration"] = mean_duration
            result["pause_std_duration"] = std_duration
            result["pause_cv"] = (
                std_duration / mean_duration if mean_duration > 0 else 0
            )

            # Location is fixed at 0, so ``shape`` is sigma and
            # ``log(scale)`` is mu of the underlying normal distribution.
            shape, loc, scale = lognorm.fit(durations, floc=0)
            result["pause_lognorm_sigma"] = shape
            result["pause_lognorm_mu"] = np.log(scale)

            # Kolmogorov-Smirnov test of the data against the fitted CDF.
            result["pause_lognorm_ks_pvalue"] = stats.kstest(
                durations,
                lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
            ).pvalue
        except (ValueError, RuntimeError) as e:
            print(f"Error fitting lognormal distribution: {e}")

        return result

    def _count_syllables_optimized(
        self,
        sound,
        intensity,
        textgrid,
        threshold,
        mindip,
        originaldur,
        speakingtot=None,
        npauses=None,
    ):
        """Count voiced intensity peaks (syllable nuclei) and derive rates.

        Args:
            sound: Sound object the pitch track is computed from.
            intensity: Intensity object of ``sound``.
            textgrid: Silence TextGrid with "sounding" intervals in tier 1.
            threshold: Minimum intensity a peak must exceed.
            mindip: Minimum dip between a peak and its successor.
            originaldur: Total duration of ``sound``.
            speakingtot: Optional precomputed total sounding time.
            npauses: Optional precomputed number of sounding intervals.
                When either optional value is missing, both are derived
                from ``textgrid``.

        Returns:
            dict: ``nsyll``, ``npause``, ``phonationtime_s``,
            ``speechrate_nsyll_dur``,
            ``articulation_rate_nsyll_phonationtime`` and
            ``ASD_speakingtime_nsyll``.
        """
        intensity_matrix = call(intensity, "Down to Matrix")
        sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
        intensity_duration = call(sound_from_intensity_matrix, "Get total duration")

        point_process = call(
            sound_from_intensity_matrix,
            "To PointProcess (extrema)",
            "Left",
            "yes",
            "no",
            "Sinc70",
        )
        numpeaks = call(point_process, "Get number of points")

        # Keep only the peaks above the intensity threshold.
        timepeaks = []
        intensities = []
        for index in range(1, numpeaks + 1):
            t = call(point_process, "Get time from index", index)
            value = call(sound_from_intensity_matrix, "Get value at time", t, "Cubic")
            if value > threshold:
                timepeaks.append(t)
                intensities.append(value)

        # A peak is valid when the dip before the next peak exceeds mindip;
        # the last peak has no successor and is never validated.
        validtime = []
        for p in range(len(timepeaks) - 1):
            dip = call(
                intensity, "Get minimum", timepeaks[p], timepeaks[p + 1], "None"
            )
            if abs(intensities[p] - dip) > mindip:
                validtime.append(timepeaks[p])

        # Count valid peaks that are voiced and lie in a sounding interval.
        pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        voicedcount = 0
        for querytime in validtime:
            whichinterval = call(textgrid, "Get interval at time", 1, querytime)
            whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
            pitch_value = pitch.get_value_at_time(querytime)
            if not math.isnan(pitch_value) and whichlabel == "sounding":
                voicedcount += 1

        # Fall back to the TextGrid when the caller did not pass the totals.
        if speakingtot is None or npauses is None:
            segments = self._sounding_segments(textgrid)
            npauses = len(segments)
            speakingtot = 0
            for beginsound, endsound in segments:
                speakingtot += endsound - beginsound

        speakingrate = voicedcount / originaldur
        articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
        asd = speakingtot / voicedcount if voicedcount > 0 else 0

        return {
            "nsyll": voicedcount,
            "npause": npauses - 1,
            "phonationtime_s": intensity_duration,
            "speechrate_nsyll_dur": speakingrate,
            "articulation_rate_nsyll_phonationtime": articulationrate,
            "ASD_speakingtime_nsyll": asd,
        }
|
352
|
+
|
353
|
+
|
354
|
+
# ## This function runs a 2-component Principal Component Analysis (PCA) on Jitter and Shimmer
|
355
|
+
|
356
|
+
|
357
|
+
def run_pca(df):
    """Project the jitter and shimmer measures onto two principal components.

    The measures are used as they are (no scaling is applied). NaN entries
    anywhere in the selected columns are replaced with 0 before fitting.

    Args:
        df: DataFrame containing the five ``*Jitter`` and six ``*Shimmer``
            columns listed in ``measures``.

    Returns:
        pandas.DataFrame: columns ``JitterPCA`` and ``ShimmerPCA`` with one
        row per row of ``df``. When PCA cannot be fitted (``ValueError``,
        e.g. a single file) every row is ``[0, 0]``.
    """
    measures = [
        "localJitter",
        "localabsoluteJitter",
        "rapJitter",
        "ppq5Jitter",
        "ddpJitter",
        "localShimmer",
        "localdbShimmer",
        "apq3Shimmer",
        "apq5Shimmer",
        "apq11Shimmer",
        "ddaShimmer",
    ]
    # Explicit copy so that zero-filling can never write through to ``df``.
    x = df.loc[:, measures].to_numpy(dtype=float, copy=True)

    # Inspect the whole matrix: a NaN in any row makes PCA raise ValueError,
    # which would otherwise be misreported as "need more than one file".
    nan_mask = np.isnan(x)
    if nan_mask.any():
        print(f"Warning: {np.count_nonzero(nan_mask)} Nans in x, replacing with 0")
        x[nan_mask] = 0

    pca = PCA(n_components=2)
    try:
        principal_components = pca.fit_transform(x)
        if np.any(np.isnan(principal_components)):
            print("pc is nan")
            print(f"count: {np.count_nonzero(np.isnan(principal_components))}")
            print(principal_components)
            principal_components = np.nan_to_num(principal_components)
    except ValueError:
        print("need more than one file for pca")
        # One zero row per input row keeps the result aligned with ``df``.
        principal_components = np.zeros((len(df), 2))

    return pd.DataFrame(
        data=principal_components, columns=["JitterPCA", "ShimmerPCA"]
    )
|
402
|
+
|
403
|
+
|
404
|
+
# ## This block of code runs the feature extraction above on every audio segment listed in a file index
|
405
|
+
|
406
|
+
|
407
|
+
def compute_features(file_index):
    """Extract the Praat feature set for every segment of ``file_index``.

    Each entry of ``file_index.to_list()`` is unpacked as
    ``(wave_file, start, end)``; ``start`` and ``end`` must offer
    ``total_seconds()`` (e.g. ``pandas.Timedelta``). A segment that raises
    during reading or extraction is reported on stdout and contributes a
    row of NaN so that the result stays aligned with ``file_index``.

    Columns produced (45 in total):
        - Basic: duration, meanF0Hz, stdevF0Hz, HNR (4)
        - Jitter: localJitter, localabsoluteJitter, rapJitter, ppq5Jitter,
          ddpJitter (5)
        - Shimmer: localShimmer, localdbShimmer, apq3Shimmer, apq5Shimmer,
          apq11Shimmer, ddaShimmer (6)
        - Formants: f1-f4 mean/median (8)
        - Pause distribution: pause_lognorm_mu, pause_lognorm_sigma,
          pause_lognorm_ks_pvalue, pause_mean_duration, pause_std_duration,
          pause_cv (6)
        - Speech rate: nsyll, npause, phonationtime_s, speechrate_nsyll_dur,
          articulation_rate_nsyll_phonationtime, ASD_speakingtime_nsyll (6)
        - Additional: proportion_pause_duration (1)
        - Derived by ``add_derived_features``: JitterPCA, ShimmerPCA, pF,
          fdisp, avgFormant, mff, fitch_vtl, delta_f, vtl_delta_f (9)

    Returns:
        pandas.DataFrame: one row per segment, in input order.
    """
    # Canonical order of the per-file features. It is used for the NaN row
    # of failed files and to order the columns, so the layout does not
    # depend on whether the first file happened to fail.
    base_columns = (
        ["duration", "meanF0Hz", "stdevF0Hz", "HNR"]
        + [
            "localJitter",
            "localabsoluteJitter",
            "rapJitter",
            "ppq5Jitter",
            "ddpJitter",
            "localShimmer",
            "localdbShimmer",
            "apq3Shimmer",
            "apq5Shimmer",
            "apq11Shimmer",
            "ddaShimmer",
        ]
        + [f"f{i}_{stat}" for stat in ("mean", "median") for i in range(1, 5)]
        + [
            "pause_lognorm_mu",
            "pause_lognorm_sigma",
            "pause_lognorm_ks_pvalue",
            "pause_mean_duration",
            "pause_std_duration",
            "pause_cv",
            "nsyll",
            "npause",
            "phonationtime_s",
            "speechrate_nsyll_dur",
            "articulation_rate_nsyll_phonationtime",
            "ASD_speakingtime_nsyll",
            "proportion_pause_duration",
        ]
    )

    extractor = AudioFeatureExtractor()
    feature_list = []

    for wave_file, start, end in tqdm(file_index.to_list()):
        try:
            signal, sampling_rate = audiofile.read(
                wave_file,
                offset=start.total_seconds(),
                duration=(end - start).total_seconds(),
                always_2d=True,
            )
            sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
            feature_list.append(extractor.extract_all_features(sound))
        except Exception as errors:
            # Deliberate best effort: one bad file must not abort the batch.
            print(f"error on file {wave_file}: {errors}")
            feature_list.append(dict.fromkeys(base_columns, np.nan))

    df = pd.DataFrame(feature_list)
    # Canonical columns first; anything else the extractor returned after.
    extra_columns = [column for column in df.columns if column not in base_columns]
    df = df.reindex(columns=base_columns + extra_columns)

    df = add_derived_features(df)

    print(
        f"Feature extraction completed. Total features extracted: {len(df.columns)}"
    )
    return df
|
484
|
+
|
485
|
+
|
486
|
+
def add_derived_features(df):
    """Add PCA and vocal-tract-length features derived from existing columns.

    Args:
        df: DataFrame with the jitter/shimmer columns required by
            ``run_pca`` and the columns ``f1_median`` .. ``f4_median``.

    Returns:
        pandas.DataFrame: ``df`` extended by ``JitterPCA``, ``ShimmerPCA``,
        ``pF``, ``fdisp``, ``avgFormant``, ``mff``, ``fitch_vtl``,
        ``delta_f`` and ``vtl_delta_f``.
    """
    pca_data = run_pca(df)
    # ``pd.concat(axis=1)`` aligns on the index; attach the PCA rows
    # positionally so a non-default index on ``df`` cannot misalign them.
    if len(pca_data) == len(df):
        pca_data.index = df.index
    df = pd.concat([df, pca_data], axis=1)

    f1 = df["f1_median"]
    f2 = df["f2_median"]
    f3 = df["f3_median"]
    f4 = df["f4_median"]

    with np.errstate(divide="ignore", invalid="ignore"):
        # nan_policy="omit": a file without formant estimates keeps NaN for
        # itself instead of turning pF into NaN for every file.
        df["pF"] = (
            sum(
                zscore(column.to_numpy(dtype=float), nan_policy="omit")
                for column in (f1, f2, f3, f4)
            )
            / 4
        )
        df["fdisp"] = (f4 - f1) / 3
        df["avgFormant"] = (f1 + f2 + f3 + f4) / 4
        df["mff"] = (f1 * f2 * f3 * f4) ** 0.25

        # Fitch VTL calculation
        df["fitch_vtl"] = (
            (1 * (35000 / (4 * f1)))
            + (3 * (35000 / (4 * f2)))
            + (5 * (35000 / (4 * f3)))
            + (7 * (35000 / (4 * f4)))
        ) / 4

        # Delta F calculation: weighted sum of the formant medians over the
        # sum of the squared weights 0.5, 1.5, 2.5 and 3.5.
        xysum = 0.5 * f1 + 1.5 * f2 + 2.5 * f3 + 3.5 * f4
        xsquaredsum = 0.5**2 + 1.5**2 + 2.5**2 + 3.5**2
        df["delta_f"] = xysum / xsquaredsum
        df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])

    return df
|
528
|
+
|
529
|
+
|
530
|
+
"""
|
531
|
+
Speech rate script taken from https://github.com/drfeinberg/PraatScripts
|
532
|
+
on 25/05/23
|
533
|
+
"""
|
534
|
+
|
535
|
+
|
536
|
+
def get_speech_rate(file_index):
    """Compute the speech-rate features for every segment of ``file_index``.

    Each entry of ``file_index.to_list()`` is unpacked as
    ``(wave_file, start, end)``. A segment for which ``speech_rate`` raises
    ``IndexError`` or ``parselmouth.PraatError`` is reported on stdout and
    contributes a row of NaN, so the result always has one row per segment.

    Returns:
        pandas.DataFrame: one row per segment with the columns in ``cols``.
    """
    # Same keys, in the same order, as the dictionary built by speech_rate.
    cols = [
        "nsyll",
        "npause",
        "phonationtime_s",
        "speechrate_nsyll_dur",
        "articulation_rate_nsyll_phonationtime",
        "ASD_speakingtime_nsyll",
        "proportion_pause_duration",
        "pause_lognorm_mu",
        "pause_lognorm_sigma",
        "pause_lognorm_ks_pvalue",
        "pause_mean_duration",
        "pause_std_duration",
        "pause_cv",
    ]
    datalist = []
    for wave_file, start, end in tqdm(file_index.to_list()):
        signal, sampling_rate = audiofile.read(
            wave_file,
            offset=start.total_seconds(),
            duration=(end - start).total_seconds(),
            always_2d=True,
        )
        try:
            sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
            datalist.append(speech_rate(sound))
        except (IndexError, parselmouth.PraatError) as error:
            print(f"error extracting speech-rate on file {wave_file}: {error}")
            # Keep a placeholder so rows stay aligned with file_index.
            datalist.append(dict.fromkeys(cols, np.nan))
    return pd.DataFrame(datalist, columns=cols)
|
570
|
+
|
571
|
+
|
572
|
+
def speech_rate(sound):
    """Estimate syllable counts, speech rates and pause statistics of a sound.

    Args:
        sound: Object accepted by ``parselmouth.praat.call`` that also
            offers ``get_total_duration``, ``to_intensity`` and
            ``to_pitch_ac`` (presumably a ``parselmouth.Sound``).

    Returns:
        dict: ``nsyll``, ``npause``, ``phonationtime_s``,
        ``speechrate_nsyll_dur``, ``articulation_rate_nsyll_phonationtime``,
        ``ASD_speakingtime_nsyll``, ``proportion_pause_duration`` and the
        six ``pause_*`` distribution features.
    """
    silencedb = -25
    mindip = 2
    minpause = 0.3
    originaldur = sound.get_total_duration()
    intensity = sound.to_intensity(50)
    min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
    max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

    # get .99 quantile to get maximum (without influence of non-speech sound bursts)
    max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)

    # estimate Intensity threshold
    threshold = max_99_intensity + silencedb
    threshold2 = max_intensity - max_99_intensity
    threshold3 = silencedb - threshold2
    if threshold < min_intensity:
        threshold = min_intensity

    # get pauses (silences) and speakingtime
    textgrid = call(
        intensity,
        "To TextGrid (silences)",
        threshold3,
        minpause,
        0.1,
        "silent",
        "sounding",
    )
    silencetier = call(textgrid, "Extract tier", 1)
    silencetable = call(silencetier, "Down to TableOfReal", "sounding")
    npauses = call(silencetable, "Get number of rows")

    speakingtot = 0
    pause_durations = []
    prev_endsound = None
    for row in range(1, npauses + 1):
        beginsound = call(silencetable, "Get value", row, 1)
        endsound = call(silencetable, "Get value", row, 2)
        speakingtot += endsound - beginsound
        # A pause is the gap between two consecutive sounding segments;
        # only positive gaps are kept.
        if prev_endsound is not None:
            pause_duration = beginsound - prev_endsound
            if pause_duration > 0:
                pause_durations.append(pause_duration)
        prev_endsound = endsound

    # Shared with AudioFeatureExtractor so both code paths compute the
    # pause distribution identically.
    pause_features = AudioFeatureExtractor()._calculate_pause_distribution(
        pause_durations
    )

    pausetot = originaldur - speakingtot

    intensity_matrix = call(intensity, "Down to Matrix")
    sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
    # use total duration, not end time, to find out duration of intdur
    # (intensity_duration) in order to allow nonzero starting times.
    intensity_duration = call(sound_from_intensity_matrix, "Get total duration")
    point_process = call(
        sound_from_intensity_matrix,
        "To PointProcess (extrema)",
        "Left",
        "yes",
        "no",
        "Sinc70",
    )

    # estimate peak positions (all peaks)
    numpeaks = call(point_process, "Get number of points")
    peak_times = [
        call(point_process, "Get time from index", index + 1)
        for index in range(numpeaks)
    ]

    # keep the peaks whose intensity exceeds the threshold
    timepeaks = []
    intensities = []
    for peak_time in peak_times:
        value = call(
            sound_from_intensity_matrix, "Get value at time", peak_time, "Cubic"
        )
        if value > threshold:
            intensities.append(value)
            timepeaks.append(peak_time)

    # a peak is valid when the dip towards the following peak exceeds mindip
    validtime = []
    currenttime = timepeaks[0] if timepeaks else 0
    currentint = intensities[0] if intensities else 0
    for p in range(len(timepeaks) - 1):
        followingtime = timepeaks[p + 1]
        dip = call(intensity, "Get minimum", currenttime, followingtime, "None")
        if abs(currentint - dip) > mindip:
            validtime.append(timepeaks[p])
        # NOTE(review): the reference peak is advanced on every iteration;
        # confirm this nesting matches the intended algorithm.
        currenttime = followingtime
        currentint = call(intensity, "Get value at time", followingtime, "Cubic")

    # Look for only voiced parts
    pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
    voicedcount = 0
    for querytime in validtime:
        whichinterval = call(textgrid, "Get interval at time", 1, querytime)
        whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
        value = pitch.get_value_at_time(querytime)
        if not math.isnan(value) and whichlabel == "sounding":
            voicedcount += 1

    # return results
    speakingrate = voicedcount / originaldur
    articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
    npause = npauses - 1
    try:
        asd = speakingtot / voicedcount
    except ZeroDivisionError:
        asd = 0
        print("caught zero division")

    try:
        proportion_pause_duration = pausetot / speakingtot
    except ZeroDivisionError:
        proportion_pause_duration = 0
        print("caught zero division for proportion pause duration")

    return {
        "nsyll": voicedcount,
        "npause": npause,
        "phonationtime_s": intensity_duration,
        "speechrate_nsyll_dur": speakingrate,
        "articulation_rate_nsyll_phonationtime": articulationrate,
        "ASD_speakingtime_nsyll": asd,
        "proportion_pause_duration": proportion_pause_duration,
        **pause_features,
    }
|