nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. nkululeko/augmenting/resampler.py +5 -2
  2. nkululeko/autopredict/ap_emotion.py +36 -0
  3. nkululeko/autopredict/ap_text.py +45 -0
  4. nkululeko/autopredict/tests/__init__.py +0 -0
  5. nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
  6. nkululeko/autopredict/whisper_transcriber.py +81 -0
  7. nkululeko/balance.py +222 -0
  8. nkululeko/constants.py +1 -1
  9. nkululeko/experiment.py +53 -3
  10. nkululeko/explore.py +32 -13
  11. nkululeko/feat_extract/feats_analyser.py +45 -17
  12. nkululeko/feat_extract/feats_emotion2vec.py +51 -26
  13. nkululeko/feat_extract/feats_praat.py +3 -3
  14. nkululeko/feat_extract/feats_praat_core.py +769 -0
  15. nkululeko/feat_extract/tests/__init__.py +1 -0
  16. nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
  17. nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
  18. nkululeko/glob_conf.py +9 -0
  19. nkululeko/modelrunner.py +15 -39
  20. nkululeko/models/model.py +4 -42
  21. nkululeko/models/model_tuned.py +416 -84
  22. nkululeko/models/model_xgb.py +148 -2
  23. nkululeko/models/tests/test_model_knn.py +49 -0
  24. nkululeko/models/tests/test_model_mlp.py +153 -0
  25. nkululeko/models/tests/test_model_xgb.py +33 -0
  26. nkululeko/nkululeko.py +0 -9
  27. nkululeko/plots.py +25 -19
  28. nkululeko/predict.py +8 -6
  29. nkululeko/reporting/report.py +7 -5
  30. nkululeko/reporting/reporter.py +20 -5
  31. nkululeko/test_predictor.py +7 -1
  32. nkululeko/tests/__init__.py +1 -0
  33. nkululeko/tests/test_balancing.py +270 -0
  34. nkululeko/utils/util.py +38 -6
  35. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
  36. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
  37. nkululeko/feat_extract/feats_opensmile copy.py +0 -93
  38. nkululeko/feat_extract/feinberg_praat.py +0 -628
  39. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
  40. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
  41. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
  42. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,769 @@
1
+ """This is a copy of David R. Feinberg's Praat scripts.
2
+ https://github.com/drfeinberg/PraatScripts
3
+ taken June 23rd 2022.
4
+
5
+ 2025-05-06: Optimized for faster computation (bta).
6
+ """
7
+
8
+ #!/usr/bin/env python3
9
+ import math
10
+ import statistics
11
+
12
+ import audiofile
13
+ import numpy as np
14
+ import pandas as pd
15
+ import parselmouth
16
+ from parselmouth.praat import call
17
+ from scipy.stats.mstats import zscore
18
+ from scipy.stats import lognorm
19
+ from scipy import stats
20
+ from sklearn.decomposition import PCA
21
+ from tqdm import tqdm
22
+
23
+
24
class AudioFeatureExtractor:
    """Compute Praat-derived acoustic features for one parselmouth ``Sound``.

    The pitch object and the glottal-pulse point process are created once in
    :meth:`extract_all_features` and handed to the helpers that need them.

    Attributes:
        f0min: Pitch floor passed to the Praat pitch / point-process /
            harmonicity commands.
        f0max: Pitch ceiling passed to the Praat pitch / point-process commands.
    """

    def __init__(self, f0min=75, f0max=300):
        self.f0min = f0min
        self.f0max = f0max

    def extract_all_features(self, sound):
        """Return one flat dict with every feature computed for ``sound``.

        Key order is: ``duration``, the pitch/jitter/shimmer/HNR keys, the
        formant keys, then the speech-rate and pause keys.
        """
        total_duration = sound.get_total_duration()
        pitch = call(sound, "To Pitch", 0.0, self.f0min, self.f0max)
        pulses = call(
            sound, "To PointProcess (periodic, cc)", self.f0min, self.f0max
        )

        features = {"duration": total_duration}
        features.update(self._extract_pitch_features(sound, pitch, pulses))
        features.update(self._extract_formant_features(sound, pulses))
        features.update(self._extract_speech_features(sound))
        return features

    def _extract_pitch_features(self, sound, pitch, point_process):
        """Return mean/stdev F0, HNR, five jitter and six shimmer measures."""
        features = {
            "meanF0Hz": call(pitch, "Get mean", 0, 0, "Hertz"),
            "stdevF0Hz": call(pitch, "Get standard deviation", 0, 0, "Hertz"),
        }

        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, self.f0min, 0.1, 1.0)
        features["HNR"] = call(harmonicity, "Get mean", 0, 0)

        # (output key, Praat command) pairs; the order fixes the dict key order.
        jitter_commands = (
            ("localJitter", "Get jitter (local)"),
            ("localabsoluteJitter", "Get jitter (local, absolute)"),
            ("rapJitter", "Get jitter (rap)"),
            ("ppq5Jitter", "Get jitter (ppq5)"),
            ("ddpJitter", "Get jitter (ddp)"),
        )
        jitter_args = (0, 0, 0.0001, 0.02, 1.3)
        for key, command in jitter_commands:
            features[key] = call(point_process, command, *jitter_args)

        shimmer_commands = (
            ("localShimmer", "Get shimmer (local)"),
            ("localdbShimmer", "Get shimmer (local_dB)"),
            ("apq3Shimmer", "Get shimmer (apq3)"),
            ("apq5Shimmer", "Get shimmer (apq5)"),
            ("apq11Shimmer", "Get shimmer (apq11)"),
            ("ddaShimmer", "Get shimmer (dda)"),
        )
        shimmer_args = (0, 0, 0.0001, 0.02, 1.3, 1.6)
        for key, command in shimmer_commands:
            # Shimmer commands operate on the sound and the pulses together.
            features[key] = call([sound, point_process], command, *shimmer_args)

        return features

    def _extract_formant_features(self, sound, point_process):
        """Return mean and median of F1-F4 sampled at every glottal pulse.

        NaN samples are skipped; a formant with no valid sample yields
        ``np.nan`` for both statistics.
        """
        formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
        n_pulses = call(point_process, "Get number of points")

        formant_numbers = (1, 2, 3, 4)
        tracks = {number: [] for number in formant_numbers}

        # Praat indices are 1-based.
        for pulse_index in range(1, n_pulses + 1):
            t = call(point_process, "Get time from index", pulse_index)
            for number in formant_numbers:
                hertz = call(
                    formants, "Get value at time", number, t, "Hertz", "Linear"
                )
                if not math.isnan(hertz):
                    tracks[number].append(hertz)

        summary = {}
        # All means first, then all medians, to keep the established key order.
        for stat_name, stat_func in (
            ("mean", statistics.mean),
            ("median", statistics.median),
        ):
            for number in formant_numbers:
                samples = tracks[number]
                summary[f"f{number}_{stat_name}"] = (
                    stat_func(samples) if samples else np.nan
                )
        return summary

    def _extract_speech_features(self, sound):
        """Return pause-distribution, syllable-rate and pause-proportion features."""
        silencedb = -25
        mindip = 2
        minpause = 0.3
        originaldur = sound.get_total_duration()

        # One intensity object serves every query below.
        intensity = sound.to_intensity(50)
        max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)
        min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
        max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

        # Peak threshold: silencedb below the 0.99 quantile, floored at the minimum.
        threshold = max(max_99_intensity + silencedb, min_intensity)
        # Silence threshold handed to the TextGrid command.
        silence_threshold = silencedb - (max_intensity - max_99_intensity)

        textgrid = call(
            intensity,
            "To TextGrid (silences)",
            silence_threshold,
            minpause,
            0.1,
            "silent",
            "sounding",
        )
        silencetier = call(textgrid, "Extract tier", 1)
        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
        n_sounding = call(silencetable, "Get number of rows")

        speakingtot = 0
        pause_durations = []
        for row in range(1, n_sounding + 1):
            beginsound = call(silencetable, "Get value", row, 1)
            endsound = call(silencetable, "Get value", row, 2)
            speakingtot += endsound - beginsound

            if row == 1:
                # No preceding sounding interval, hence no gap to measure.
                continue
            previous_end = call(silencetable, "Get value", row - 1, 2)
            gap = beginsound - previous_end
            if gap > 0:
                pause_durations.append(gap)

        pause_features = self._calculate_pause_distribution(pause_durations)
        syllable_features = self._count_syllables_optimized(
            sound, intensity, textgrid, threshold, mindip, originaldur
        )

        pausetot = originaldur - speakingtot
        proportion_pause_duration = pausetot / speakingtot if speakingtot > 0 else 0

        features = dict(pause_features)
        features.update(syllable_features)
        features["proportion_pause_duration"] = proportion_pause_duration
        return features

    def _calculate_pause_distribution(self, pause_durations):
        """Return pause statistics and a lognormal fit of ``pause_durations``.

        With fewer than three durations every value is ``np.nan``. A fitting
        error (``ValueError`` / ``RuntimeError``) is printed and leaves the
        values not yet computed at ``np.nan``.
        """
        result = dict.fromkeys(
            (
                "pause_lognorm_mu",
                "pause_lognorm_sigma",
                "pause_lognorm_ks_pvalue",
                "pause_mean_duration",
                "pause_std_duration",
                "pause_cv",
            ),
            np.nan,
        )
        if len(pause_durations) < 3:
            return result

        try:
            samples = np.array(pause_durations)
            mean_duration = np.mean(samples)
            result["pause_mean_duration"] = mean_duration
            std_duration = np.std(samples)
            result["pause_std_duration"] = std_duration
            result["pause_cv"] = (
                std_duration / mean_duration if mean_duration > 0 else 0
            )

            # Location is pinned to 0, so only shape and scale are estimated.
            shape, loc, scale = lognorm.fit(samples, floc=0)
            result["pause_lognorm_sigma"] = shape
            result["pause_lognorm_mu"] = np.log(scale)

            # Kolmogorov-Smirnov test of the samples against the fitted CDF.
            _, result["pause_lognorm_ks_pvalue"] = stats.kstest(
                samples,
                lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
            )
        except (ValueError, RuntimeError) as e:
            print(f"Error fitting lognormal distribution: {e}")

        return result

    def _count_syllables_optimized(
        self, sound, intensity, textgrid, threshold, mindip, originaldur
    ):
        """Count voiced intensity peaks and derive the speech-rate measures.

        A peak counts when it exceeds ``threshold``, is separated from the next
        peak by a dip larger than ``mindip``, lies in a "sounding" interval of
        ``textgrid`` and has a defined pitch value.
        """
        intensity_matrix = call(intensity, "Down to Matrix")
        contour = call(intensity_matrix, "To Sound (slice)", 1)
        intensity_duration = call(contour, "Get total duration")

        extrema = call(
            contour,
            "To PointProcess (extrema)",
            "Left",
            "yes",
            "no",
            "Sinc70",
        )
        n_extrema = call(extrema, "Get number of points")

        # (time, level) of every extremum above the threshold.
        peaks = []
        for extremum_index in range(1, n_extrema + 1):
            t = call(extrema, "Get time from index", extremum_index)
            level = call(contour, "Get value at time", t, "Cubic")
            if level > threshold:
                peaks.append((t, level))

        # Compare each peak with its successor; the last peak is never kept.
        validtime = []
        for (peak_time, peak_level), (next_time, _) in zip(peaks, peaks[1:]):
            dip = call(intensity, "Get minimum", peak_time, next_time, "None")
            if abs(peak_level - dip) > mindip:
                validtime.append(peak_time)

        pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        voicedcount = 0
        for querytime in validtime:
            interval_index = call(textgrid, "Get interval at time", 1, querytime)
            label = call(textgrid, "Get label of interval", 1, interval_index)
            f0 = pitch.get_value_at_time(querytime)
            if math.isnan(f0):
                continue
            if label == "sounding":
                voicedcount += 1

        # Total sounding time, read from the "sounding" rows of the silence tier.
        silencetier = call(textgrid, "Extract tier", 1)
        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
        npauses = call(silencetable, "Get number of rows")

        speakingtot = 0
        for row in range(1, npauses + 1):
            beginsound = call(silencetable, "Get value", row, 1)
            endsound = call(silencetable, "Get value", row, 2)
            speakingtot += endsound - beginsound

        speakingrate = voicedcount / originaldur
        articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
        asd = speakingtot / voicedcount if voicedcount > 0 else 0

        return {
            "nsyll": voicedcount,
            "npause": npauses - 1,
            "phonationtime_s": intensity_duration,
            "speechrate_nsyll_dur": speakingrate,
            "articulation_rate_nsyll_phonationtime": articulationrate,
            "ASD_speakingtime_nsyll": asd,
        }
352
+
353
+
354
+ # ## This function runs a 2-factor Principal Components Analysis (PCA) on Jitter and Shimmer
355
+
356
+
357
def run_pca(df):
    """Project the jitter and shimmer measures onto two principal components.

    Args:
        df: DataFrame holding the five jitter and six shimmer columns listed
            in ``measures``.

    Returns:
        A DataFrame with the columns ``JitterPCA`` and ``ShimmerPCA`` and one
        row per row of ``df`` (default integer index).

    NaNs anywhere in the selected columns are replaced by 0 before the PCA.
    If the PCA raises ``ValueError``, the result is filled with zeros so that
    it can still be concatenated column-wise with ``df``.
    """
    measures = [
        "localJitter",
        "localabsoluteJitter",
        "rapJitter",
        "ppq5Jitter",
        "ddpJitter",
        "localShimmer",
        "localdbShimmer",
        "apq3Shimmer",
        "apq5Shimmer",
        "apq11Shimmer",
        "ddaShimmer",
    ]
    # Explicit float copy: guarantees np.isnan works and the array is writable.
    x = df.loc[:, measures].to_numpy(dtype=float, copy=True)

    # Check the whole matrix, not just the first row: a NaN in any row makes
    # PCA.fit_transform raise.
    nan_mask = np.isnan(x)
    if nan_mask.any():
        print(f"Warning: {np.count_nonzero(nan_mask)} Nans in x, replacing with 0")
        x[nan_mask] = 0

    pca = PCA(n_components=2)
    try:
        principal_components = pca.fit_transform(x)
    except ValueError:
        print("need more than one file for pca")
        # One zero row per input row keeps the result aligned with df.
        principal_components = np.zeros((x.shape[0], 2))
    else:
        if np.any(np.isnan(principal_components)):
            print("pc is nan")
            print(f"count: {np.count_nonzero(np.isnan(principal_components))}")
            print(principal_components)
            principal_components = np.nan_to_num(principal_components)

    return pd.DataFrame(
        data=principal_components, columns=["JitterPCA", "ShimmerPCA"]
    )
402
+
403
+
404
+ # ## This block of code runs the above functions on every (file, start, end) segment of the given file index
405
+
406
+
407
def compute_features(file_index):
    """Extract all acoustic features for every segment of ``file_index``.

    Args:
        file_index: Object whose ``to_list()`` yields ``(wave_file, start, end)``
            tuples; ``start`` and ``end`` must provide ``total_seconds()``.

    Returns:
        A DataFrame with one row per segment and 45 columns:

        - ``duration`` (1)
        - pitch: meanF0Hz, stdevF0Hz, HNR (3)
        - jitter: localJitter, localabsoluteJitter, rapJitter, ppq5Jitter,
          ddpJitter (5)
        - shimmer: localShimmer, localdbShimmer, apq3Shimmer, apq5Shimmer,
          apq11Shimmer, ddaShimmer (6)
        - formants: f1-f4 mean/median (8)
        - pause distribution: pause_lognorm_mu, pause_lognorm_sigma,
          pause_lognorm_ks_pvalue, pause_mean_duration, pause_std_duration,
          pause_cv (6)
        - speech rate: nsyll, npause, phonationtime_s, speechrate_nsyll_dur,
          articulation_rate_nsyll_phonationtime, ASD_speakingtime_nsyll (6)
        - proportion_pause_duration (1)
        - derived: JitterPCA, ShimmerPCA, pF, fdisp, avgFormant, mff,
          fitch_vtl, delta_f, vtl_delta_f (9)

    A segment whose extraction raises is reported on stdout and contributes a
    row of NaNs, so the result stays aligned with ``file_index``.
    """
    extractor = AudioFeatureExtractor()

    # Keys of the NaN row used for failed segments. The speech/pause keys come
    # last, in extraction order, so the column order is the same whether or
    # not the first segment fails, and the schema is complete even when every
    # segment fails.
    nan_row_keys = (
        ["duration", "meanF0Hz", "stdevF0Hz", "HNR"]
        + [f"f{i}_{stat}" for i in range(1, 5) for stat in ["mean", "median"]]
        + [
            "localJitter",
            "localabsoluteJitter",
            "rapJitter",
            "ppq5Jitter",
            "ddpJitter",
            "localShimmer",
            "localdbShimmer",
            "apq3Shimmer",
            "apq5Shimmer",
            "apq11Shimmer",
            "ddaShimmer",
        ]
        + [
            "pause_lognorm_mu",
            "pause_lognorm_sigma",
            "pause_lognorm_ks_pvalue",
            "pause_mean_duration",
            "pause_std_duration",
            "pause_cv",
            "nsyll",
            "npause",
            "phonationtime_s",
            "speechrate_nsyll_dur",
            "articulation_rate_nsyll_phonationtime",
            "ASD_speakingtime_nsyll",
            "proportion_pause_duration",
        ]
    )

    feature_list = []
    for wave_file, start, end in tqdm(file_index.to_list()):
        try:
            signal, sampling_rate = audiofile.read(
                wave_file,
                offset=start.total_seconds(),
                duration=(end - start).total_seconds(),
                always_2d=True,
            )
            sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
            feature_list.append(extractor.extract_all_features(sound))
        except Exception as errors:
            # Deliberate best-effort: one unreadable or degenerate segment
            # must not abort the whole run.
            print(f"error on file {wave_file}: {errors}")
            feature_list.append(dict.fromkeys(nan_row_keys, np.nan))

    df = pd.DataFrame(feature_list)

    # PCA and vocal-tract columns are computed over the whole table.
    df = add_derived_features(df)

    print(
        f"Feature extraction completed. Total features extracted: {len(df.columns)}"
    )
    return df
484
+
485
+
486
def add_derived_features(df):
    """Append the PCA columns and the formant-derived columns to ``df``.

    Adds ``JitterPCA`` and ``ShimmerPCA`` (from ``run_pca``) followed by
    ``pF``, ``fdisp``, ``avgFormant``, ``mff``, ``fitch_vtl``, ``delta_f`` and
    ``vtl_delta_f``, all computed from the four formant median columns.
    Returns the extended DataFrame.
    """
    df = pd.concat([df, run_pca(df)], axis=1)

    first = df["f1_median"]
    second = df["f2_median"]
    third = df["f3_median"]
    fourth = df["f4_median"]

    # Divisions by zero / invalid values must not emit numpy warnings here.
    with np.errstate(divide="ignore", invalid="ignore"):
        zscore_total = zscore(first) + zscore(second) + zscore(third) + zscore(fourth)
        df["pF"] = zscore_total / 4
        df["fdisp"] = (fourth - first) / 3
        df["avgFormant"] = (first + second + third + fourth) / 4
        df["mff"] = (first * second * third * fourth) ** 0.25

        # Fitch VTL: average of the odd-multiple terms for F1..F4.
        fitch_total = (
            (1 * (35000 / (4 * first)))
            + (3 * (35000 / (4 * second)))
            + (5 * (35000 / (4 * third)))
            + (7 * (35000 / (4 * fourth)))
        )
        df["fitch_vtl"] = fitch_total / 4

        # Delta F: sum of weight * formant over the sum of squared weights,
        # with the weights 0.5, 1.5, 2.5 and 3.5.
        weighted_sum = 0.5 * first + 1.5 * second + 2.5 * third + 3.5 * fourth
        squared_weights = 0.5**2 + 1.5**2 + 2.5**2 + 3.5**2
        df["delta_f"] = weighted_sum / squared_weights
        df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])

    return df
528
+
529
+
530
+ """
531
+ Speech rate script taken from https://github.com/drfeinberg/PraatScripts
532
+ on 25/05/23
533
+ """
534
+
535
+
536
def get_speech_rate(file_index):
    """Compute the ``speech_rate`` measures for every segment of ``file_index``.

    Args:
        file_index: Object whose ``to_list()`` yields ``(wave_file, start, end)``
            tuples; ``start`` and ``end`` must provide ``total_seconds()``.

    Returns:
        A DataFrame with one row per successfully processed segment; the
        columns are the keys of the dict returned by ``speech_rate``.

    Segments for which an ``IndexError`` or ``parselmouth.PraatError`` is
    raised are reported on stdout and skipped.
    NOTE(review): skipped segments leave no row, so the result can be shorter
    than ``file_index`` - confirm that callers do not rely on positional
    alignment.
    """
    datalist = []
    for wave_file, start, end in tqdm(file_index.to_list()):
        # Read errors are not caught here and propagate to the caller.
        signal, sampling_rate = audiofile.read(
            wave_file,
            offset=start.total_seconds(),
            duration=(end - start).total_seconds(),
            always_2d=True,
        )
        try:
            sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
            datalist.append(speech_rate(sound))
        except (IndexError, parselmouth.PraatError) as err:
            print(f"error extracting speech-rate on file {wave_file}: {err}")
    return pd.DataFrame(datalist)
570
+
571
+
572
def speech_rate(sound):
    """Estimate syllable-rate and pause measures for one parselmouth ``Sound``.

    Sounding intervals are found with Praat's silence detection on the
    intensity contour. Syllables are counted as intensity peaks that exceed a
    threshold, are followed by a dip larger than ``mindip``, lie in a
    "sounding" interval and have a defined pitch value.

    Returns:
        A dict with the keys ``nsyll``, ``npause``, ``phonationtime_s``,
        ``speechrate_nsyll_dur``, ``articulation_rate_nsyll_phonationtime``,
        ``ASD_speakingtime_nsyll``, ``proportion_pause_duration``,
        ``pause_lognorm_mu``, ``pause_lognorm_sigma``,
        ``pause_lognorm_ks_pvalue``, ``pause_mean_duration``,
        ``pause_std_duration`` and ``pause_cv``. The six ``pause_*`` values are
        ``np.nan`` when fewer than three pauses were found.
    """
    silencedb = -25
    mindip = 2
    minpause = 0.3
    originaldur = sound.get_total_duration()
    intensity = sound.to_intensity(50)
    min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
    max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

    # The 0.99 quantile serves as the maximum, without the influence of
    # non-speech sound bursts.
    max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)

    # Peak threshold, floored at the minimum intensity.
    threshold = max_99_intensity + silencedb
    if threshold < min_intensity:
        threshold = min_intensity
    # Silence threshold passed to the TextGrid command.
    threshold3 = silencedb - (max_intensity - max_99_intensity)

    # Sounding intervals and the gaps between them.
    textgrid = call(
        intensity,
        "To TextGrid (silences)",
        threshold3,
        minpause,
        0.1,
        "silent",
        "sounding",
    )
    silencetier = call(textgrid, "Extract tier", 1)
    silencetable = call(silencetier, "Down to TableOfReal", "sounding")
    npauses = call(silencetable, "Get number of rows")
    speakingtot = 0
    pause_durations = []

    for ipause in range(npauses):
        row = ipause + 1  # Praat rows are 1-based
        beginsound = call(silencetable, "Get value", row, 1)
        endsound = call(silencetable, "Get value", row, 2)
        speakingtot += endsound - beginsound

        if ipause > 0:
            # ``ipause`` is the 1-based index of the previous row.
            prev_endsound = call(silencetable, "Get value", ipause, 2)
            pause_duration = beginsound - prev_endsound
            if pause_duration > 0:
                pause_durations.append(pause_duration)

    # Pause statistics and lognormal fit; NaN unless at least three pauses.
    pause_lognorm_mu = np.nan
    pause_lognorm_sigma = np.nan
    pause_lognorm_ks_pvalue = np.nan
    pause_mean_duration = np.nan
    pause_std_duration = np.nan
    pause_cv = np.nan

    if len(pause_durations) >= 3:
        try:
            pause_durations_array = np.array(pause_durations)

            pause_mean_duration = np.mean(pause_durations_array)
            pause_std_duration = np.std(pause_durations_array)
            pause_cv = (
                pause_std_duration / pause_mean_duration
                if pause_mean_duration > 0
                else 0
            )

            # Location is pinned to 0, so only shape and scale are estimated.
            shape, loc, scale = lognorm.fit(pause_durations_array, floc=0)
            pause_lognorm_sigma = shape
            pause_lognorm_mu = np.log(scale)

            # Kolmogorov-Smirnov test against the fitted CDF.
            _, pause_lognorm_ks_pvalue = stats.kstest(
                pause_durations_array,
                lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
            )
        except (ValueError, RuntimeError) as e:
            print(f"Error fitting lognormal distribution to pause durations: {e}")

    pausetot = originaldur - speakingtot

    intensity_matrix = call(intensity, "Down to Matrix")
    sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
    # Total duration (not end time) is used to allow nonzero starting times.
    intensity_duration = call(sound_from_intensity_matrix, "Get total duration")
    point_process = call(
        sound_from_intensity_matrix,
        "To PointProcess (extrema)",
        "Left",
        "yes",
        "no",
        "Sinc70",
    )

    # Keep every extremum whose intensity exceeds the threshold.
    numpeaks = call(point_process, "Get number of points")
    timepeaks = []
    intensities = []
    for i in range(numpeaks):
        peak_time = call(point_process, "Get time from index", i + 1)
        value = call(
            sound_from_intensity_matrix, "Get value at time", peak_time, "Cubic"
        )
        if value > threshold:
            intensities.append(value)
            timepeaks.append(peak_time)

    # A peak is valid when the dip between it and the following peak is larger
    # than mindip; the last peak is never evaluated.
    validtime = []
    currenttime = timepeaks[0] if timepeaks else 0
    currentint = intensities[0] if intensities else 0

    for p in range(len(timepeaks) - 1):
        following_time = timepeaks[p + 1]
        dip = call(intensity, "Get minimum", currenttime, following_time, "None")
        if abs(currentint - dip) > mindip:
            validtime.append(timepeaks[p])
        currenttime = following_time
        currentint = call(intensity, "Get value at time", following_time, "Cubic")

    # Keep only peaks that are voiced and lie in a "sounding" interval.
    pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
    voicedcount = 0
    voicedpeak = []

    for querytime in validtime:
        whichinterval = call(textgrid, "Get interval at time", 1, querytime)
        whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
        value = pitch.get_value_at_time(querytime)
        if not math.isnan(value):
            if whichlabel == "sounding":
                voicedcount += 1
                voicedpeak.append(querytime)

    # Time correction for the shift between the Sound and the intensity object.
    timecorrection = originaldur / intensity_duration

    # Insert the voiced peaks into the TextGrid. The TextGrid is not returned;
    # the calls are retained because Praat may raise on an invalid insertion.
    call(textgrid, "Insert point tier", 1, "syllables")
    for peak in voicedpeak:
        call(textgrid, "Insert point", 1, peak * timecorrection, "")

    speakingrate = voicedcount / originaldur
    articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
    npause = npauses - 1
    try:
        asd = speakingtot / voicedcount
    except ZeroDivisionError:
        asd = 0
        print("caught zero division")

    try:
        proportion_pause_duration = pausetot / speakingtot
    except ZeroDivisionError:
        proportion_pause_duration = 0
        print("caught zero division for proportion pause duration")

    return {
        "nsyll": voicedcount,
        "npause": npause,
        "phonationtime_s": intensity_duration,
        "speechrate_nsyll_dur": speakingrate,
        "articulation_rate_nsyll_phonationtime": articulationrate,
        "ASD_speakingtime_nsyll": asd,
        "proportion_pause_duration": proportion_pause_duration,
        "pause_lognorm_mu": pause_lognorm_mu,
        "pause_lognorm_sigma": pause_lognorm_sigma,
        "pause_lognorm_ks_pvalue": pause_lognorm_ks_pvalue,
        "pause_mean_duration": pause_mean_duration,
        "pause_std_duration": pause_std_duration,
        "pause_cv": pause_cv,
    }