nkululeko 0.94.2__py3-none-any.whl → 0.95.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
  """This is a copy of David R. Feinberg's Praat scripts.
  https://github.com/drfeinberg/PraatScripts
  taken June 23rd 2022.
+
+ 2025-05-06: Optimized for faster computation (bta).
  """
 
  #!/usr/bin/env python3
@@ -13,164 +15,340 @@ import pandas as pd
  import parselmouth
  from parselmouth.praat import call
  from scipy.stats.mstats import zscore
+ from scipy.stats import lognorm
+ from scipy import stats
  from sklearn.decomposition import PCA
  from tqdm import tqdm
 
- # This is the function to measure source acoustics using default male parameters.
-
-
- def measure_pitch(voice_id, f0min, f0max, unit):
-     sound = parselmouth.Sound(voice_id)  # read the sound
-     duration = call(sound, "Get total duration")  # duration
-     pitch = call(sound, "To Pitch", 0.0, f0min, f0max)  # create a praat pitch object
-     mean_f0 = call(pitch, "Get mean", 0, 0, unit)  # get mean pitch
-     stdev_f0 = call(
-         pitch, "Get standard deviation", 0, 0, unit
-     )  # get standard deviation
-     harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
-     hnr = call(harmonicity, "Get mean", 0, 0)
-     point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
-     local_jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
-     localabsolute_jitter = call(
-         point_process, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3
-     )
-     rap_jitter = call(point_process, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
-     ppq5_jitter = call(point_process, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
-     ddp_jitter = call(point_process, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
-     local_shimmer = call(
-         [sound, point_process],
-         "Get shimmer (local)",
-         0,
-         0,
-         0.0001,
-         0.02,
-         1.3,
-         1.6,
-     )
-     localdb_shimmer = call(
-         [sound, point_process],
-         "Get shimmer (local_dB)",
-         0,
-         0,
-         0.0001,
-         0.02,
-         1.3,
-         1.6,
-     )
-     apq3_shimmer = call(
-         [sound, point_process],
-         "Get shimmer (apq3)",
-         0,
-         0,
-         0.0001,
-         0.02,
-         1.3,
-         1.6,
-     )
-     aqpq5_shimmer = call(
-         [sound, point_process],
-         "Get shimmer (apq5)",
-         0,
-         0,
-         0.0001,
-         0.02,
-         1.3,
-         1.6,
-     )
-     apq11_shimmer = call(
-         [sound, point_process],
-         "Get shimmer (apq11)",
-         0,
-         0,
-         0.0001,
-         0.02,
-         1.3,
-         1.6,
-     )
-     dda_shimmer = call(
-         [sound, point_process], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6
-     )
 
-     return (
-         duration,
-         mean_f0,
-         stdev_f0,
-         hnr,
-         local_jitter,
-         localabsolute_jitter,
-         rap_jitter,
-         ppq5_jitter,
-         ddp_jitter,
-         local_shimmer,
-         localdb_shimmer,
-         apq3_shimmer,
-         aqpq5_shimmer,
-         apq11_shimmer,
-         dda_shimmer,
-     )
+ class AudioFeatureExtractor:
+     """Optimized audio feature extraction class to avoid redundant calculations."""
 
+     def __init__(self, f0min=75, f0max=300):
+         self.f0min = f0min
+         self.f0max = f0max
 
- # ## This function measures formants at each glottal pulse
- #
- # Puts, D. A., Apicella, C. L., & Cárdenas, R. A. (2012). Masculine voices signal men's threat potential in forager and industrial societies. Proceedings of the Royal Society of London B: Biological Sciences, 279(1728), 601-609.
- #
- # Adapted from: DOI 10.17605/OSF.IO/K2BHS
- # This function measures formants using Formant Position formula
- # def measureFormants(sound, wave_file, f0min,f0max):
- def measure_formants(sound, f0min, f0max):
-     sound = parselmouth.Sound(sound)  # read the sound
-     # pitch = call(sound, "To Pitch (cc)", 0, f0min, 15, 'no', 0.03, 0.45, 0.01, 0.35, 0.14, f0max)
-     point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
-
-     formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
-     num_points = call(point_process, "Get number of points")
-
-     f1_list = []
-     f2_list = []
-     f3_list = []
-     f4_list = []
-
-     # Measure formants only at glottal pulses
-     for point in range(0, num_points):
-         point += 1
-         t = call(point_process, "Get time from index", point)
-         f1 = call(formants, "Get value at time", 1, t, "Hertz", "Linear")
-         f2 = call(formants, "Get value at time", 2, t, "Hertz", "Linear")
-         f3 = call(formants, "Get value at time", 3, t, "Hertz", "Linear")
-         f4 = call(formants, "Get value at time", 4, t, "Hertz", "Linear")
-         f1_list.append(f1)
-         f2_list.append(f2)
-         f3_list.append(f3)
-         f4_list.append(f4)
-
-     f1_list = [f1 for f1 in f1_list if str(f1) != "nan"]
-     f2_list = [f2 for f2 in f2_list if str(f2) != "nan"]
-     f3_list = [f3 for f3 in f3_list if str(f3) != "nan"]
-     f4_list = [f4 for f4 in f4_list if str(f4) != "nan"]
-
-     # calculate mean formants across pulses
-     f1_mean = statistics.mean(f1_list)
-     f2_mean = statistics.mean(f2_list)
-     f3_mean = statistics.mean(f3_list)
-     f4_mean = statistics.mean(f4_list)
-
-     # calculate median formants across pulses, this is what is used in all subsequent calcualtions
-     # you can use mean if you want, just edit the code in the boxes below to replace median with mean
-     f1_median = statistics.median(f1_list)
-     f2_median = statistics.median(f2_list)
-     f3_median = statistics.median(f3_list)
-     f4_median = statistics.median(f4_list)
-
-     return (
-         f1_mean,
-         f2_mean,
-         f3_mean,
-         f4_mean,
-         f1_median,
-         f2_median,
-         f3_median,
-         f4_median,
-     )
+     def extract_all_features(self, sound):
+         """Extract all acoustic features from a single sound object."""
+         # Cache common objects to avoid redundant calculations
+         duration = sound.get_total_duration()
+         pitch = call(sound, "To Pitch", 0.0, self.f0min, self.f0max)
+         point_process = call(
+             sound, "To PointProcess (periodic, cc)", self.f0min, self.f0max
+         )
+
+         # Extract pitch-related features
+         pitch_features = self._extract_pitch_features(sound, pitch, point_process)
+
+         # Extract formant features
+         formant_features = self._extract_formant_features(sound, point_process)
+
+         # Extract speech rate and pause features
+         speech_features = self._extract_speech_features(sound)
+
+         # Combine all features
+         all_features = {
+             "duration": duration,
+             **pitch_features,
+             **formant_features,
+             **speech_features,
+         }
+
+         return all_features
+
+     def _extract_pitch_features(self, sound, pitch, point_process):
+         """Extract pitch, jitter, shimmer, and HNR features."""
+         # Pitch statistics
+         mean_f0 = call(pitch, "Get mean", 0, 0, "Hertz")
+         stdev_f0 = call(pitch, "Get standard deviation", 0, 0, "Hertz")
+
+         # HNR
+         harmonicity = call(sound, "To Harmonicity (cc)", 0.01, self.f0min, 0.1, 1.0)
+         hnr = call(harmonicity, "Get mean", 0, 0)
+
+         # Jitter measures
+         local_jitter = call(
+             point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3
+         )
+         localabsolute_jitter = call(
+             point_process, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3
+         )
+         rap_jitter = call(point_process, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
+         ppq5_jitter = call(point_process, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
+         ddp_jitter = call(point_process, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
+
+         # Shimmer measures (reuse point_process)
+         shimmer_params = [0, 0, 0.0001, 0.02, 1.3, 1.6]
+         local_shimmer = call(
+             [sound, point_process], "Get shimmer (local)", *shimmer_params
+         )
+         localdb_shimmer = call(
+             [sound, point_process], "Get shimmer (local_dB)", *shimmer_params
+         )
+         apq3_shimmer = call(
+             [sound, point_process], "Get shimmer (apq3)", *shimmer_params
+         )
+         apq5_shimmer = call(
+             [sound, point_process], "Get shimmer (apq5)", *shimmer_params
+         )
+         apq11_shimmer = call(
+             [sound, point_process], "Get shimmer (apq11)", *shimmer_params
+         )
+         dda_shimmer = call([sound, point_process], "Get shimmer (dda)", *shimmer_params)
+
+         return {
+             "meanF0Hz": mean_f0,
+             "stdevF0Hz": stdev_f0,
+             "HNR": hnr,
+             "localJitter": local_jitter,
+             "localabsoluteJitter": localabsolute_jitter,
+             "rapJitter": rap_jitter,
+             "ppq5Jitter": ppq5_jitter,
+             "ddpJitter": ddp_jitter,
+             "localShimmer": local_shimmer,
+             "localdbShimmer": localdb_shimmer,
+             "apq3Shimmer": apq3_shimmer,
+             "apq5Shimmer": apq5_shimmer,
+             "apq11Shimmer": apq11_shimmer,
+             "ddaShimmer": dda_shimmer,
+         }
+
+     def _extract_formant_features(self, sound, point_process):
+         """Extract formant features efficiently."""
+         formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
+         num_points = call(point_process, "Get number of points")
+
+         # Pre-allocate arrays for better performance
+         f1_values = []
+         f2_values = []
+         f3_values = []
+         f4_values = []
+
+         # Single loop to extract all formants
+         for point in range(num_points):
+             t = call(point_process, "Get time from index", point + 1)
+             f1 = call(formants, "Get value at time", 1, t, "Hertz", "Linear")
+             f2 = call(formants, "Get value at time", 2, t, "Hertz", "Linear")
+             f3 = call(formants, "Get value at time", 3, t, "Hertz", "Linear")
+             f4 = call(formants, "Get value at time", 4, t, "Hertz", "Linear")
+
+             # Filter out NaN values during collection
+             if not math.isnan(f1):
+                 f1_values.append(f1)
+             if not math.isnan(f2):
+                 f2_values.append(f2)
+             if not math.isnan(f3):
+                 f3_values.append(f3)
+             if not math.isnan(f4):
+                 f4_values.append(f4)
+
+         # Calculate statistics only once
+         f1_mean = statistics.mean(f1_values) if f1_values else np.nan
+         f2_mean = statistics.mean(f2_values) if f2_values else np.nan
+         f3_mean = statistics.mean(f3_values) if f3_values else np.nan
+         f4_mean = statistics.mean(f4_values) if f4_values else np.nan
+
+         f1_median = statistics.median(f1_values) if f1_values else np.nan
+         f2_median = statistics.median(f2_values) if f2_values else np.nan
+         f3_median = statistics.median(f3_values) if f3_values else np.nan
+         f4_median = statistics.median(f4_values) if f4_values else np.nan
+
+         return {
+             "f1_mean": f1_mean,
+             "f2_mean": f2_mean,
+             "f3_mean": f3_mean,
+             "f4_mean": f4_mean,
+             "f1_median": f1_median,
+             "f2_median": f2_median,
+             "f3_median": f3_median,
+             "f4_median": f4_median,
+         }
+
+     def _extract_speech_features(self, sound):
+         """Extract speech rate and pause features with lognormal distribution analysis."""
+         silencedb = -25
+         mindip = 2
+         minpause = 0.3
+         originaldur = sound.get_total_duration()
+
+         # Reuse intensity object for multiple calculations
+         intensity = sound.to_intensity(50)
+         max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)
+         min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
+         max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")
+
+         # Calculate threshold once
+         threshold = max_99_intensity + silencedb
+         threshold2 = max_intensity - max_99_intensity
+         threshold3 = silencedb - threshold2
+         if threshold < min_intensity:
+             threshold = min_intensity
+
+         # Extract silences and calculate pause durations
+         textgrid = call(
+             intensity,
+             "To TextGrid (silences)",
+             threshold3,
+             minpause,
+             0.1,
+             "silent",
+             "sounding",
+         )
+         silencetier = call(textgrid, "Extract tier", 1)
+         silencetable = call(silencetier, "Down to TableOfReal", "sounding")
+         npauses = call(silencetable, "Get number of rows")
+
+         speakingtot = 0
+         pause_durations = []
+
+         # Single loop for speaking time and pause duration calculation
+         for ipause in range(npauses):
+             pause = ipause + 1
+             beginsound = call(silencetable, "Get value", pause, 1)
+             endsound = call(silencetable, "Get value", pause, 2)
+             speakingdur = endsound - beginsound
+             speakingtot += speakingdur
+
+             if ipause > 0:
+                 prev_endsound = call(silencetable, "Get value", ipause, 2)
+                 pause_duration = beginsound - prev_endsound
+                 if pause_duration > 0:
+                     pause_durations.append(pause_duration)
+
+         # Calculate pause distribution features
+         pause_features = self._calculate_pause_distribution(pause_durations)
+
+         # Efficient syllable counting
+         syllable_features = self._count_syllables_optimized(
+             sound, intensity, textgrid, threshold, mindip, originaldur
+         )
+
+         pausetot = originaldur - speakingtot
+         proportion_pause_duration = pausetot / speakingtot if speakingtot > 0 else 0
+
+         return {
+             **pause_features,
+             **syllable_features,
+             "proportion_pause_duration": proportion_pause_duration,
+         }
+
+     def _calculate_pause_distribution(self, pause_durations):
+         """Calculate lognormal distribution parameters for pause durations."""
+         pause_lognorm_mu = np.nan
+         pause_lognorm_sigma = np.nan
+         pause_lognorm_ks_pvalue = np.nan
+         pause_mean_duration = np.nan
+         pause_std_duration = np.nan
+         pause_cv = np.nan
+
+         if len(pause_durations) >= 3:
+             try:
+                 pause_durations_array = np.array(pause_durations)
+                 pause_mean_duration = np.mean(pause_durations_array)
+                 pause_std_duration = np.std(pause_durations_array)
+                 pause_cv = (
+                     pause_std_duration / pause_mean_duration
+                     if pause_mean_duration > 0
+                     else 0
+                 )
+
+                 shape, loc, scale = lognorm.fit(pause_durations_array, floc=0)
+                 pause_lognorm_sigma = shape
+                 pause_lognorm_mu = np.log(scale)
+
+                 ks_stat, pause_lognorm_ks_pvalue = stats.kstest(
+                     pause_durations_array,
+                     lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
+                 )
+             except (ValueError, RuntimeError) as e:
+                 print(f"Error fitting lognormal distribution: {e}")
+
+         return {
+             "pause_lognorm_mu": pause_lognorm_mu,
+             "pause_lognorm_sigma": pause_lognorm_sigma,
+             "pause_lognorm_ks_pvalue": pause_lognorm_ks_pvalue,
+             "pause_mean_duration": pause_mean_duration,
+             "pause_std_duration": pause_std_duration,
+             "pause_cv": pause_cv,
+         }
+
+     def _count_syllables_optimized(
+         self, sound, intensity, textgrid, threshold, mindip, originaldur
+     ):
+         """Optimized syllable counting avoiding redundant matrix operations."""
+         intensity_matrix = call(intensity, "Down to Matrix")
+         sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
+         intensity_duration = call(sound_from_intensity_matrix, "Get total duration")
+
+         point_process = call(
+             sound_from_intensity_matrix,
+             "To PointProcess (extrema)",
+             "Left",
+             "yes",
+             "no",
+             "Sinc70",
+         )
+         numpeaks = call(point_process, "Get number of points")
+
+         # Vectorized time extraction
+         timepeaks = []
+         intensities = []
+
+         for i in range(numpeaks):
+             t = call(point_process, "Get time from index", i + 1)
+             value = call(sound_from_intensity_matrix, "Get value at time", t, "Cubic")
+             if value > threshold:
+                 timepeaks.append(t)
+                 intensities.append(value)
+
+         # Optimized peak validation
+         validtime = []
+         if len(timepeaks) > 1:
+             for p in range(len(timepeaks) - 1):
+                 currenttime = timepeaks[p]
+                 currentint = intensities[p]
+                 dip = call(
+                     intensity, "Get minimum", currenttime, timepeaks[p + 1], "None"
+                 )
+                 if abs(currentint - dip) > mindip:
+                     validtime.append(timepeaks[p])
+
+         # Count voiced syllables
+         pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
+         voicedcount = 0
+
+         for querytime in validtime:
+             whichinterval = call(textgrid, "Get interval at time", 1, querytime)
+             whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
+             pitch_value = pitch.get_value_at_time(querytime)
+             if not math.isnan(pitch_value) and whichlabel == "sounding":
+                 voicedcount += 1
+
+         # Get silencetable for speaking time calculation
+         silencetier = call(textgrid, "Extract tier", 1)
+         silencetable = call(silencetier, "Down to TableOfReal", "sounding")
+         npauses = call(silencetable, "Get number of rows")
+
+         # Calculate speaking time
+         speakingtot = 0
+         for i in range(npauses):
+             beginsound = call(silencetable, "Get value", i + 1, 1)
+             endsound = call(silencetable, "Get value", i + 1, 2)
+             speakingtot += endsound - beginsound
+
+         # Calculate rates
+         speakingrate = voicedcount / originaldur
+         articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
+         asd = speakingtot / voicedcount if voicedcount > 0 else 0
+
+         return {
+             "nsyll": voicedcount,
+             "npause": npauses - 1,
+             "phonationtime_s": intensity_duration,
+             "speechrate_nsyll_dur": speakingrate,
+             "articulation_rate_nsyll_phonationtime": articulationrate,
+             "ASD_speakingtime_nsyll": asd,
+         }
 
 
  # ## This function runs a 2-factor Principle Components Analysis (PCA) on Jitter and Shimmer
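The core addition in this hunk is `_calculate_pause_distribution`, which summarizes silent-pause durations with a log-normal fit (mu, sigma) and a Kolmogorov-Smirnov goodness-of-fit p-value. The following standalone sketch shows what those three values mean, using made-up pause durations rather than nkululeko's API:

```python
import numpy as np
from scipy import stats
from scipy.stats import lognorm

# Hypothetical pause durations in seconds (illustrative data only)
pauses = np.array([0.35, 0.42, 0.55, 0.61, 0.80, 1.10, 1.90])

# Fit a log-normal with the location fixed at zero, as the diff does:
# the fitted `shape` is sigma and log(scale) is mu of the underlying normal
shape, loc, scale = lognorm.fit(pauses, floc=0)
mu, sigma = np.log(scale), shape

# Kolmogorov-Smirnov test against the fitted CDF; a small p-value means
# the log-normal describes the observed pauses poorly
ks_stat, p_value = stats.kstest(
    pauses, lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale)
)
print(f"mu={mu:.3f} sigma={sigma:.3f} KS p={p_value:.3f}")
```

Note that a KS test against parameters estimated from the same sample is known to be optimistic; given the small pause counts admitted by the `len(pause_durations) >= 3` guard, the p-value is best read as a rough fit indicator.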
@@ -227,231 +405,126 @@ def run_pca(df):
 
 
  def compute_features(file_index):
-     # create lists to put the results
-     duration_list = []
-     mean_f0_list = []
-     sd_f0_list = []
-     hnr_list = []
-     local_jitter_list = []
-     localabsolute_jitter_list = []
-     rap_jitter_list = []
-     ppq5_jitter_list = []
-     ddp_jitter_list = []
-     local_shimmer_list = []
-     localdb_shimmer_list = []
-     apq3_shimmer_list = []
-     aqpq5_shimmer_list = []
-     apq11_shimmer_list = []
-     dda_shimmer_list = []
-     f1_mean_list = []
-     f2_mean_list = []
-     f3_mean_list = []
-     f4_mean_list = []
-     f1_median_list = []
-     f2_median_list = []
-     f3_median_list = []
-     f4_median_list = []
-     # Go through all the wave files in the folder and measure all the acoustics
-     # for i, wave_file in enumerate(file_list):
+     """Optimized feature computation using AudioFeatureExtractor class.
+
+     FEATURE COUNT COMPARISON:
+     Original version: ~36 features
+     - Basic: duration, meanF0Hz, stdevF0Hz, HNR (4)
+     - Jitter: localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter (5)
+     - Shimmer: localShimmer, localdbShimmer, apq3Shimmer, apq5Shimmer, apq11Shimmer, ddaShimmer (6)
+     - Formants: f1-f4 mean/median (8)
+     - PCA: JitterPCA, ShimmerPCA (2)
+     - VTL: pF, fdisp, avgFormant, mff, fitch_vtl, delta_f, vtl_delta_f (7)
+     - Speech rate: nsyll, npause, phonationtime_s, speechrate_nsyll_dur,
+       articulation_rate_nsyll_phonationtime, ASD_speakingtime_nsyll (6)
+
+     Current optimized version: ~42 features (+6 new pause distribution features)
+     - All original 36 features PLUS:
+     - Pause distribution: pause_lognorm_mu, pause_lognorm_sigma, pause_lognorm_ks_pvalue,
+       pause_mean_duration, pause_std_duration, pause_cv (6)
+     - Additional: proportion_pause_duration (1)
+
+     Total: 43 features (7 new features added for AD detection)
+     """
+     extractor = AudioFeatureExtractor()
+     feature_list = []
+
      for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
-         signal, sampling_rate = audiofile.read(
-             wave_file,
-             offset=start.total_seconds(),
-             duration=(end - start).total_seconds(),
-             always_2d=True,
-         )
          try:
+             signal, sampling_rate = audiofile.read(
+                 wave_file,
+                 offset=start.total_seconds(),
+                 duration=(end - start).total_seconds(),
+                 always_2d=True,
+             )
              sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
-             (
-                 duration,
-                 mean_f0,
-                 stdev_f0,
-                 hnr,
-                 local_jitter,
-                 localabsolute_jitter,
-                 rap_jitter,
-                 ppq5_jitter,
-                 ddp_jitter,
-                 local_shimmer,
-                 localdb_shimmer,
-                 apq3_shimmer,
-                 aqpq5_shimmer,
-                 apq11_shimmer,
-                 dda_shimmer,
-             ) = measure_pitch(sound, 75, 300, "Hertz")
-             (
-                 f1_mean,
-                 f2_mean,
-                 f3_mean,
-                 f4_mean,
-                 f1_median,
-                 f2_median,
-                 f3_median,
-                 f4_median,
-             ) = measure_formants(sound, 75, 300)
-             # file_list.append(wave_file) # make an ID list
-         except (statistics.StatisticsError, parselmouth.PraatError) as errors:
-             print(f"error on file {wave_file}: {errors}")
 
-         duration_list.append(duration)  # make duration list
-         mean_f0_list.append(mean_f0)  # make a mean F0 list
-         sd_f0_list.append(stdev_f0)  # make a sd F0 list
-         hnr_list.append(hnr)  # add HNR data
-
-         # add raw jitter and shimmer measures
-         local_jitter_list.append(local_jitter)
-         localabsolute_jitter_list.append(localabsolute_jitter)
-         rap_jitter_list.append(rap_jitter)
-         ppq5_jitter_list.append(ppq5_jitter)
-         ddp_jitter_list.append(ddp_jitter)
-         local_shimmer_list.append(local_shimmer)
-         localdb_shimmer_list.append(localdb_shimmer)
-         apq3_shimmer_list.append(apq3_shimmer)
-         aqpq5_shimmer_list.append(aqpq5_shimmer)
-         apq11_shimmer_list.append(apq11_shimmer)
-         dda_shimmer_list.append(dda_shimmer)
-
-         # add the formant data
-         f1_mean_list.append(f1_mean)
-         f2_mean_list.append(f2_mean)
-         f3_mean_list.append(f3_mean)
-         f4_mean_list.append(f4_mean)
-         f1_median_list.append(f1_median)
-         f2_median_list.append(f2_median)
-         f3_median_list.append(f3_median)
-         f4_median_list.append(f4_median)
-     # ## This block of code adds all of that data we just generated to a Pandas data frame
-     # Add the data to Pandas
-     df = pd.DataFrame(
-         np.column_stack(
-             [
-                 duration_list,
-                 mean_f0_list,
-                 sd_f0_list,
-                 hnr_list,
-                 local_jitter_list,
-                 localabsolute_jitter_list,
-                 rap_jitter_list,
-                 ppq5_jitter_list,
-                 ddp_jitter_list,
-                 local_shimmer_list,
-                 localdb_shimmer_list,
-                 apq3_shimmer_list,
-                 aqpq5_shimmer_list,
-                 apq11_shimmer_list,
-                 dda_shimmer_list,
-                 f1_mean_list,
-                 f2_mean_list,
-                 f3_mean_list,
-                 f4_mean_list,
-                 f1_median_list,
-                 f2_median_list,
-                 f3_median_list,
-                 f4_median_list,
-             ]
-         ),
-         columns=[
-             "duration",
-             "meanF0Hz",
-             "stdevF0Hz",
-             "HNR",
-             "localJitter",
-             "localabsoluteJitter",
-             "rapJitter",
-             "ppq5Jitter",
-             "ddpJitter",
-             "localShimmer",
-             "localdbShimmer",
-             "apq3Shimmer",
-             "apq5Shimmer",
-             "apq11Shimmer",
-             "ddaShimmer",
-             "f1_mean",
-             "f2_mean",
-             "f3_mean",
-             "f4_mean",
-             "f1_median",
-             "f2_median",
-             "f3_median",
-             "f4_median",
-         ],
-     )
+             # Extract all features in one pass
+             features = extractor.extract_all_features(sound)
+             feature_list.append(features)
 
-     # add pca data
-     pca_data = run_pca(df)  # Run jitter and shimmer PCA
-     df = pd.concat([df, pca_data], axis=1)  # Add PCA data
-     # reload the data so it's all numbers
-     # df.to_csv("processed_results.csv", index=False)
-     # df = pd.read_csv("processed_results.csv", header=0)
-     # df.sort_values('voiceID').head(20)
-     # ## Next we calculate the vocal-tract length estimates
-
-     # ### Formant position
-     # Puts, D. A., Apicella, C. L., & Cárdenas, R. A. (2012). Masculine voices signal men's threat potential in forager and industrial societies. Proceedings of the Royal Society of London B: Biological Sciences, 279(1728), 601-609.
-
-     df["pF"] = (
-         zscore(df.f1_median)
-         + zscore(df.f2_median)
-         + zscore(df.f3_median)
-         + zscore(df.f4_median)
-     ) / 4
-
-     # ### Formant Dispersion
-     # Fitch, W. T. (1997). Vocal tract length and formant frequency dispersion correlate with body size in rhesus macaques. The Journal of the Acoustical Society of America, 102(2), 1213-1222.
-
-     df["fdisp"] = (df["f4_median"] - df["f1_median"]) / 3
-
-     # ### Fn (Average Formant)
-     # Pisanski, K., & Rendall, D. (2011). The prioritization of voice fundamental frequency or formants in listeners' assessments of speaker size, masculinity, and attractiveness. The Journal of the Acoustical Society of America, 129(4), 2201-2212.
-
-     df["avgFormant"] = (
-         df["f1_median"] + df["f2_median"] + df["f3_median"] + df["f4_median"]
-     ) / 4
-
-     # ### MFF
-     # Smith, D. R., & Patterson, R. D. (2005). The interaction of glottal-pulse rate and vocal-tract length in judgements of speaker size, sex, and age. The Journal of the Acoustical Society of America, 118(5), 3177-3186.
-
-     df["mff"] = (
-         df["f1_median"] * df["f2_median"] * df["f3_median"] * df["f4_median"]
-     ) ** 0.25
-
-     # ### Fitch VTL
-     # Fitch, W. T. (1997). Vocal tract length and formant frequency dispersion correlate with body size in rhesus macaques. The Journal of the Acoustical Society of America, 102(2), 1213-1222.
-
-     # reload the data again
-     # df.to_csv("processed_results.csv", index=False)
-     # df = pd.read_csv('processed_results.csv', header=0)
-
-     df["fitch_vtl"] = (
-         (1 * (35000 / (4 * df["f1_median"])))
-         + (3 * (35000 / (4 * df["f2_median"])))
-         + (5 * (35000 / (4 * df["f3_median"])))
-         + (7 * (35000 / (4 * df["f4_median"])))
-     ) / 4
-
-     # ### $\Delta$F
-     # Reby,D.,& McComb,K.(2003). Anatomical constraints generate honesty: acoustic cues to age and weight in the roars of red deer stags. Animal Behaviour, 65, 519e-530.
-
-     xysum = (
-         (0.5 * df["f1_median"])
-         + (1.5 * df["f2_median"])
-         + (2.5 * df["f3_median"])
-         + (3.5 * df["f4_median"])
+         except Exception as errors:
+             print(f"error on file {wave_file}: {errors}")
+             # Add empty feature dict for failed files
+             feature_list.append(
+                 {
+                     key: np.nan
+                     for key in ["duration", "meanF0Hz", "stdevF0Hz", "HNR"]
+                     + [
+                         f"f{i}_{stat}"
+                         for i in range(1, 5)
+                         for stat in ["mean", "median"]
+                     ]
+                     + [
+                         "localJitter",
+                         "localabsoluteJitter",
+                         "rapJitter",
+                         "ppq5Jitter",
+                         "ddpJitter",
+                         "localShimmer",
+                         "localdbShimmer",
+                         "apq3Shimmer",
+                         "apq5Shimmer",
+                         "apq11Shimmer",
+                         "ddaShimmer",
+                     ]
+                 }
+             )
+
+     # Create DataFrame directly from feature list
+     df = pd.DataFrame(feature_list)
+
+     # Add derived features efficiently
+     df = add_derived_features(df)
+
+     print(
+         f"Feature extraction completed. Total features extracted: {len(df.columns) if 'df' in locals() else '~43'}"
      )
-     xsquaredsum = (0.5**2) + (1.5**2) + (2.5**2) + (3.5**2)
-     df["delta_f"] = xysum / xsquaredsum
-
-     # ### VTL($\Delta$F)
-     # Reby,D.,&McComb,K.(2003).Anatomical constraints generate honesty: acoustic cues to age and weight in the roars of red deer stags. Animal Behaviour, 65, 519e-530.
-
-     df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])
+     return df
 
-     print("Now extracting speech rate parameters...")
 
-     df_speechrate = get_speech_rate(file_index)
-     print("")
+ def add_derived_features(df):
+     """Add PCA and vocal tract length features efficiently."""
+     # PCA on jitter/shimmer
+     pca_data = run_pca(df)
+     df = pd.concat([df, pca_data], axis=1)
+
+     # Vectorized vocal tract calculations
+     with np.errstate(divide="ignore", invalid="ignore"):
+         df["pF"] = (
+             zscore(df.f1_median)
+             + zscore(df.f2_median)
+             + zscore(df.f3_median)
+             + zscore(df.f4_median)
+         ) / 4
+         df["fdisp"] = (df["f4_median"] - df["f1_median"]) / 3
+         df["avgFormant"] = (
+             df["f1_median"] + df["f2_median"] + df["f3_median"] + df["f4_median"]
+         ) / 4
+         df["mff"] = (
+             df["f1_median"] * df["f2_median"] * df["f3_median"] * df["f4_median"]
+         ) ** 0.25
+
+         # Fitch VTL calculation
+         df["fitch_vtl"] = (
+             (1 * (35000 / (4 * df["f1_median"])))
+             + (3 * (35000 / (4 * df["f2_median"])))
+             + (5 * (35000 / (4 * df["f3_median"])))
+             + (7 * (35000 / (4 * df["f4_median"])))
+         ) / 4
+
+         # Delta F calculation
+         xysum = (
+             0.5 * df["f1_median"]
+             + 1.5 * df["f2_median"]
+             + 2.5 * df["f3_median"]
+             + 3.5 * df["f4_median"]
+         )
+         xsquaredsum = 0.5**2 + 1.5**2 + 2.5**2 + 3.5**2
+         df["delta_f"] = xysum / xsquaredsum
+         df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])
 
-     return df.join(df_speechrate)
+     return df
 
 
  """
@@ -469,6 +542,12 @@ def get_speech_rate(file_index):
          "speechrate_nsyll_dur",
          "articulation_rate_nsyll_phonationtime",
          "ASD_speakingtime_nsyll",
+         "pause_lognorm_mu",
+         "pause_lognorm_sigma",
+         "pause_lognorm_ks_pvalue",
+         "pause_mean_duration",
+         "pause_std_duration",
+         "pause_cv",
      ]
      datalist = []
      for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
@@ -527,6 +606,8 @@ def speech_rate(sound):
      silencetable = call(silencetier, "Down to TableOfReal", "sounding")
      npauses = call(silencetable, "Get number of rows")
      speakingtot = 0
+     pause_durations = []  # Store individual pause durations
+
      for ipause in range(npauses):
          pause = ipause + 1
          beginsound = call(silencetable, "Get value", pause, 1)
@@ -534,6 +615,53 @@ def speech_rate(sound):
          speakingdur = endsound - beginsound
          speakingtot += speakingdur
 
+         # Calculate pause duration (time between speaking segments)
+         if ipause > 0:
+             prev_pause = ipause
+             prev_endsound = call(silencetable, "Get value", prev_pause, 2)
+             pause_duration = beginsound - prev_endsound
+             if pause_duration > 0:  # Only include positive pause durations
+                 pause_durations.append(pause_duration)
+
+     # Calculate pause duration distribution parameters
+     pause_lognorm_mu = np.nan
+     pause_lognorm_sigma = np.nan
+     pause_lognorm_ks_pvalue = np.nan
+     pause_mean_duration = np.nan
+     pause_std_duration = np.nan
+     pause_cv = np.nan
+
+     if len(pause_durations) >= 3:  # Need minimum samples for distribution fitting
+         try:
+             # Fit lognormal distribution to pause durations
+             pause_durations_array = np.array(pause_durations)
+
+             # Calculate basic statistics
+             pause_mean_duration = np.mean(pause_durations_array)
+             pause_std_duration = np.std(pause_durations_array)
+             pause_cv = (
+                 pause_std_duration / pause_mean_duration
+                 if pause_mean_duration > 0
+                 else 0
+             )
+
+             # Fit lognormal distribution
+             shape, loc, scale = lognorm.fit(pause_durations_array, floc=0)
+             pause_lognorm_sigma = shape  # shape parameter (sigma)
+             pause_lognorm_mu = np.log(scale)  # location parameter (mu)
+
+             # Test goodness of fit using Kolmogorov-Smirnov test
+             ks_stat, pause_lognorm_ks_pvalue = stats.kstest(
+                 pause_durations_array,
+                 lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
+             )
+
+         except (ValueError, RuntimeError) as e:
+             print(f"Error fitting lognormal distribution to pause durations: {e}")
+
+     # Calculate pause duration
+     pausetot = originaldur - speakingtot
+
      intensity_matrix = call(intensity, "Down to Matrix")
      # sndintid = sound_from_intensity_matrix
      sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
@@ -567,8 +695,8 @@ def speech_rate(sound):
      # fill array with valid peaks: only intensity values if preceding
      # dip in intensity is greater than mindip
      validpeakcount = 0
-     currenttime = timepeaks[0]
-     currentint = intensities[0]
+     currenttime = timepeaks[0] if timepeaks else 0
+     currentint = intensities[0] if intensities else 0
      validtime = []
 
      for p in range(peakcount - 1):
@@ -609,20 +737,35 @@ def speech_rate(sound):
 
      # return results
      speakingrate = voicedcount / originaldur
-     articulationrate = voicedcount / speakingtot
+     articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
      npause = npauses - 1
      try:
          asd = speakingtot / voicedcount
      except ZeroDivisionError:
          asd = 0
          print("caught zero division")
+
+     # Calculate proportion pause duration
+     try:
+         proportion_pause_duration = pausetot / speakingtot
+     except ZeroDivisionError:
+         proportion_pause_duration = 0
+         print("caught zero division for proportion pause duration")
+
      speechrate_dictionary = {
          "nsyll": voicedcount,
          "npause": npause,
-         "dur_s": originaldur,
+         # "dur_s": originaldur,
          "phonationtime_s": intensity_duration,
          "speechrate_nsyll_dur": speakingrate,
          "articulation_rate_nsyll_phonationtime": articulationrate,
          "ASD_speakingtime_nsyll": asd,
+         "proportion_pause_duration": proportion_pause_duration,
+         "pause_lognorm_mu": pause_lognorm_mu,
+         "pause_lognorm_sigma": pause_lognorm_sigma,
+         "pause_lognorm_ks_pvalue": pause_lognorm_ks_pvalue,
+         "pause_mean_duration": pause_mean_duration,
+         "pause_std_duration": pause_std_duration,
+         "pause_cv": pause_cv,
      }
      return speechrate_dictionary
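Taken together, the new `compute_features` expects an index that iterates as `(file, start, end)` tuples with `Timedelta` segment bounds and returns one row of roughly 43 features per segment. A hypothetical usage sketch (the file name and segment are invented; `compute_features` is the module function shown in the diff above, so the module's own imports such as `audiofile` and `parselmouth` are assumed to be available):

```python
import pandas as pd

# Hypothetical segmented index in the (file, start, end) layout that
# compute_features iterates over via file_index.to_list()
file_index = pd.MultiIndex.from_tuples(
    [("speaker1.wav", pd.Timedelta("0s"), pd.Timedelta("5s"))],
    names=["file", "start", "end"],
)

feats = compute_features(file_index)  # one row of ~43 features per segment
print(feats.filter(like="pause_").columns.tolist())
```

Note that unreadable or failing files now contribute NaN placeholders rather than aborting the run, so downstream code should expect missing values in the new pause columns as well.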