PyPI - nkululeko - Versions diffs - 0.94.2__py3-none-any.whl → 0.95.0__py3-none-any.whl - Mend

nkululeko: 0.94.2-py3-none-any.whl → 0.95.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

nkululeko/augmenting/resampler.py +25 -14
nkululeko/autopredict/ap_emotion.py +36 -0
nkululeko/autopredict/ap_text.py +45 -0
nkululeko/autopredict/whisper_transcriber.py +81 -0
nkululeko/constants.py +1 -1
nkululeko/experiment.py +53 -3
nkululeko/explore.py +32 -13
nkululeko/feat_extract/feats_analyser.py +45 -17
nkululeko/feat_extract/feats_emotion2vec.py +51 -26
nkululeko/feat_extract/feinberg_praat.py +515 -372
nkululeko/glob_conf.py +9 -0
nkululeko/modelrunner.py +15 -6
nkululeko/models/model_tuned.py +416 -84
nkululeko/models/model_xgb.py +149 -3
nkululeko/plots.py +25 -19
nkululeko/predict.py +6 -5
nkululeko/reporting/report.py +7 -5
nkululeko/reporting/reporter.py +8 -5
nkululeko/runmanager.py +1 -1
nkululeko/utils/util.py +34 -2
{nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/METADATA +1 -1
{nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/RECORD +26 -23
{nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/WHEEL +0 -0
{nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/entry_points.txt +0 -0
{nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/licenses/LICENSE +0 -0
{nkululeko-0.94.2.dist-info → nkululeko-0.95.0.dist-info}/top_level.txt +0 -0

nkululeko/feat_extract/feinberg_praat.py CHANGED

@@ -1,6 +1,8 @@
 """This is a copy of David R. Feinberg's Praat scripts.
 https://github.com/drfeinberg/PraatScripts
 taken June 23rd 2022.
+2025-05-06: Optimized for faster computation (bta).
 """
 #!/usr/bin/env python3
@@ -13,164 +15,340 @@ import pandas as pd
 import parselmouth
 from parselmouth.praat import call
 from scipy.stats.mstats import zscore
+from scipy.stats import lognorm
+from scipy import stats
 from sklearn.decomposition import PCA
 from tqdm import tqdm
-# This is the function to measure source acoustics using default male parameters.
-def measure_pitch(voice_id, f0min, f0max, unit):
-    sound = parselmouth.Sound(voice_id)  # read the sound
-    duration = call(sound, "Get total duration")  # duration
-    pitch = call(sound, "To Pitch", 0.0, f0min, f0max)  # create a praat pitch object
-    mean_f0 = call(pitch, "Get mean", 0, 0, unit)  # get mean pitch
-    stdev_f0 = call(
-        pitch, "Get standard deviation", 0, 0, unit
-    )  # get standard deviation
-    harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
-    hnr = call(harmonicity, "Get mean", 0, 0)
-    point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
-    local_jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
-    localabsolute_jitter = call(
-        point_process, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3
-    )
-    rap_jitter = call(point_process, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
-    ppq5_jitter = call(point_process, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
-    ddp_jitter = call(point_process, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
-    local_shimmer = call(
-        [sound, point_process],
-        "Get shimmer (local)",
-        0,
-        0,
-        0.0001,
-        0.02,
-        1.3,
-        1.6,
-    )
-    localdb_shimmer = call(
-        [sound, point_process],
-        "Get shimmer (local_dB)",
-        0,
-        0,
-        0.0001,
-        0.02,
-        1.3,
-        1.6,
-    )
-    apq3_shimmer = call(
-        [sound, point_process],
-        "Get shimmer (apq3)",
-        0,
-        0,
-        0.0001,
-        0.02,
-        1.3,
-        1.6,
-    )
-    aqpq5_shimmer = call(
-        [sound, point_process],
-        "Get shimmer (apq5)",
-        0,
-        0,
-        0.0001,
-        0.02,
-        1.3,
-        1.6,
-    )
-    apq11_shimmer = call(
-        [sound, point_process],
-        "Get shimmer (apq11)",
-        0,
-        0,
-        0.0001,
-        0.02,
-        1.3,
-        1.6,
-    )
-    dda_shimmer = call(
-        [sound, point_process], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6
-    )
-    return (
-        duration,
-        mean_f0,
-        stdev_f0,
-        hnr,
-        local_jitter,
-        localabsolute_jitter,
-        rap_jitter,
-        ppq5_jitter,
-        ddp_jitter,
-        local_shimmer,
-        localdb_shimmer,
-        apq3_shimmer,
-        aqpq5_shimmer,
-        apq11_shimmer,
-        dda_shimmer,
-    )
+class AudioFeatureExtractor:
+    """Optimized audio feature extraction class to avoid redundant calculations."""
+    def __init__(self, f0min=75, f0max=300):
+        self.f0min = f0min
+        self.f0max = f0max
-# ## This function measures formants at each glottal pulse
-#
-# Puts, D. A., Apicella, C. L., & Cárdenas, R. A. (2012). Masculine voices signal men's threat potential in forager and industrial societies. Proceedings of the Royal Society of London B: Biological Sciences, 279(1728), 601-609.
-#
-# Adapted from: DOI 10.17605/OSF.IO/K2BHS
-# This function measures formants using Formant Position formula
-# def measureFormants(sound, wave_file, f0min,f0max):
-def measure_formants(sound, f0min, f0max):
-    sound = parselmouth.Sound(sound)  # read the sound
-    #    pitch = call(sound, "To Pitch (cc)", 0, f0min, 15, 'no', 0.03, 0.45, 0.01, 0.35, 0.14, f0max)
-    point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
-    formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
-    num_points = call(point_process, "Get number of points")
-    f1_list = []
-    f2_list = []
-    f3_list = []
-    f4_list = []
-    # Measure formants only at glottal pulses
-    for point in range(0, num_points):
-        point += 1
-        t = call(point_process, "Get time from index", point)
-        f1 = call(formants, "Get value at time", 1, t, "Hertz", "Linear")
-        f2 = call(formants, "Get value at time", 2, t, "Hertz", "Linear")
-        f3 = call(formants, "Get value at time", 3, t, "Hertz", "Linear")
-        f4 = call(formants, "Get value at time", 4, t, "Hertz", "Linear")
-        f1_list.append(f1)
-        f2_list.append(f2)
-        f3_list.append(f3)
-        f4_list.append(f4)
-    f1_list = [f1 for f1 in f1_list if str(f1) != "nan"]
-    f2_list = [f2 for f2 in f2_list if str(f2) != "nan"]
-    f3_list = [f3 for f3 in f3_list if str(f3) != "nan"]
-    f4_list = [f4 for f4 in f4_list if str(f4) != "nan"]
-    # calculate mean formants across pulses
-    f1_mean = statistics.mean(f1_list)
-    f2_mean = statistics.mean(f2_list)
-    f3_mean = statistics.mean(f3_list)
-    f4_mean = statistics.mean(f4_list)
-    # calculate median formants across pulses, this is what is used in all subsequent calcualtions
-    # you can use mean if you want, just edit the code in the boxes below to replace median with mean
-    f1_median = statistics.median(f1_list)
-    f2_median = statistics.median(f2_list)
-    f3_median = statistics.median(f3_list)
-    f4_median = statistics.median(f4_list)
-    return (
-        f1_mean,
-        f2_mean,
-        f3_mean,
-        f4_mean,
-        f1_median,
-        f2_median,
-        f3_median,
-        f4_median,
-    )
+    def extract_all_features(self, sound):
+        """Extract all acoustic features from a single sound object."""
+        # Cache common objects to avoid redundant calculations
+        duration = sound.get_total_duration()
+        pitch = call(sound, "To Pitch", 0.0, self.f0min, self.f0max)
+        point_process = call(
+            sound, "To PointProcess (periodic, cc)", self.f0min, self.f0max
+        )
+        # Extract pitch-related features
+        pitch_features = self._extract_pitch_features(sound, pitch, point_process)
+        # Extract formant features
+        formant_features = self._extract_formant_features(sound, point_process)
+        # Extract speech rate and pause features
+        speech_features = self._extract_speech_features(sound)
+        # Combine all features
+        all_features = {
+            "duration": duration,
+            **pitch_features,
+            **formant_features,
+            **speech_features,
+        }
+        return all_features
+    def _extract_pitch_features(self, sound, pitch, point_process):
+        """Extract pitch, jitter, shimmer, and HNR features."""
+        # Pitch statistics
+        mean_f0 = call(pitch, "Get mean", 0, 0, "Hertz")
+        stdev_f0 = call(pitch, "Get standard deviation", 0, 0, "Hertz")
+        # HNR
+        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, self.f0min, 0.1, 1.0)
+        hnr = call(harmonicity, "Get mean", 0, 0)
+        # Jitter measures
+        local_jitter = call(
+            point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3
+        )
+        localabsolute_jitter = call(
+            point_process, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3
+        )
+        rap_jitter = call(point_process, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
+        ppq5_jitter = call(point_process, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
+        ddp_jitter = call(point_process, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
+        # Shimmer measures (reuse point_process)
+        shimmer_params = [0, 0, 0.0001, 0.02, 1.3, 1.6]
+        local_shimmer = call(
+            [sound, point_process], "Get shimmer (local)", *shimmer_params
+        )
+        localdb_shimmer = call(
+            [sound, point_process], "Get shimmer (local_dB)", *shimmer_params
+        )
+        apq3_shimmer = call(
+            [sound, point_process], "Get shimmer (apq3)", *shimmer_params
+        )
+        apq5_shimmer = call(
+            [sound, point_process], "Get shimmer (apq5)", *shimmer_params
+        )
+        apq11_shimmer = call(
+            [sound, point_process], "Get shimmer (apq11)", *shimmer_params
+        )
+        dda_shimmer = call([sound, point_process], "Get shimmer (dda)", *shimmer_params)
+        return {
+            "meanF0Hz": mean_f0,
+            "stdevF0Hz": stdev_f0,
+            "HNR": hnr,
+            "localJitter": local_jitter,
+            "localabsoluteJitter": localabsolute_jitter,
+            "rapJitter": rap_jitter,
+            "ppq5Jitter": ppq5_jitter,
+            "ddpJitter": ddp_jitter,
+            "localShimmer": local_shimmer,
+            "localdbShimmer": localdb_shimmer,
+            "apq3Shimmer": apq3_shimmer,
+            "apq5Shimmer": apq5_shimmer,
+            "apq11Shimmer": apq11_shimmer,
+            "ddaShimmer": dda_shimmer,
+        }
+    def _extract_formant_features(self, sound, point_process):
+        """Extract formant features efficiently."""
+        formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
+        num_points = call(point_process, "Get number of points")
+        # Pre-allocate arrays for better performance
+        f1_values = []
+        f2_values = []
+        f3_values = []
+        f4_values = []
+        # Single loop to extract all formants
+        for point in range(num_points):
+            t = call(point_process, "Get time from index", point + 1)
+            f1 = call(formants, "Get value at time", 1, t, "Hertz", "Linear")
+            f2 = call(formants, "Get value at time", 2, t, "Hertz", "Linear")
+            f3 = call(formants, "Get value at time", 3, t, "Hertz", "Linear")
+            f4 = call(formants, "Get value at time", 4, t, "Hertz", "Linear")
+            # Filter out NaN values during collection
+            if not math.isnan(f1):
+                f1_values.append(f1)
+            if not math.isnan(f2):
+                f2_values.append(f2)
+            if not math.isnan(f3):
+                f3_values.append(f3)
+            if not math.isnan(f4):
+                f4_values.append(f4)
+        # Calculate statistics only once
+        f1_mean = statistics.mean(f1_values) if f1_values else np.nan
+        f2_mean = statistics.mean(f2_values) if f2_values else np.nan
+        f3_mean = statistics.mean(f3_values) if f3_values else np.nan
+        f4_mean = statistics.mean(f4_values) if f4_values else np.nan
+        f1_median = statistics.median(f1_values) if f1_values else np.nan
+        f2_median = statistics.median(f2_values) if f2_values else np.nan
+        f3_median = statistics.median(f3_values) if f3_values else np.nan
+        f4_median = statistics.median(f4_values) if f4_values else np.nan
+        return {
+            "f1_mean": f1_mean,
+            "f2_mean": f2_mean,
+            "f3_mean": f3_mean,
+            "f4_mean": f4_mean,
+            "f1_median": f1_median,
+            "f2_median": f2_median,
+            "f3_median": f3_median,
+            "f4_median": f4_median,
+        }
+    def _extract_speech_features(self, sound):
+        """Extract speech rate and pause features with lognormal distribution analysis."""
+        silencedb = -25
+        mindip = 2
+        minpause = 0.3
+        originaldur = sound.get_total_duration()
+        # Reuse intensity object for multiple calculations
+        intensity = sound.to_intensity(50)
+        max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)
+        min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
+        max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")
+        # Calculate threshold once
+        threshold = max_99_intensity + silencedb
+        threshold2 = max_intensity - max_99_intensity
+        threshold3 = silencedb - threshold2
+        if threshold < min_intensity:
+            threshold = min_intensity
+        # Extract silences and calculate pause durations
+        textgrid = call(
+            intensity,
+            "To TextGrid (silences)",
+            threshold3,
+            minpause,
+            0.1,
+            "silent",
+            "sounding",
+        )
+        silencetier = call(textgrid, "Extract tier", 1)
+        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
+        npauses = call(silencetable, "Get number of rows")
+        speakingtot = 0
+        pause_durations = []
+        # Single loop for speaking time and pause duration calculation
+        for ipause in range(npauses):
+            pause = ipause + 1
+            beginsound = call(silencetable, "Get value", pause, 1)
+            endsound = call(silencetable, "Get value", pause, 2)
+            speakingdur = endsound - beginsound
+            speakingtot += speakingdur
+            if ipause > 0:
+                prev_endsound = call(silencetable, "Get value", ipause, 2)
+                pause_duration = beginsound - prev_endsound
+                if pause_duration > 0:
+                    pause_durations.append(pause_duration)
+        # Calculate pause distribution features
+        pause_features = self._calculate_pause_distribution(pause_durations)
+        # Efficient syllable counting
+        syllable_features = self._count_syllables_optimized(
+            sound, intensity, textgrid, threshold, mindip, originaldur
+        )
+        pausetot = originaldur - speakingtot
+        proportion_pause_duration = pausetot / speakingtot if speakingtot > 0 else 0
+        return {
+            **pause_features,
+            **syllable_features,
+            "proportion_pause_duration": proportion_pause_duration,
+        }
+    def _calculate_pause_distribution(self, pause_durations):
+        """Calculate lognormal distribution parameters for pause durations."""
+        pause_lognorm_mu = np.nan
+        pause_lognorm_sigma = np.nan
+        pause_lognorm_ks_pvalue = np.nan
+        pause_mean_duration = np.nan
+        pause_std_duration = np.nan
+        pause_cv = np.nan
+        if len(pause_durations) >= 3:
+            try:
+                pause_durations_array = np.array(pause_durations)
+                pause_mean_duration = np.mean(pause_durations_array)
+                pause_std_duration = np.std(pause_durations_array)
+                pause_cv = (
+                    pause_std_duration / pause_mean_duration
+                    if pause_mean_duration > 0
+                    else 0
+                )
+                shape, loc, scale = lognorm.fit(pause_durations_array, floc=0)
+                pause_lognorm_sigma = shape
+                pause_lognorm_mu = np.log(scale)
+                ks_stat, pause_lognorm_ks_pvalue = stats.kstest(
+                    pause_durations_array,
+                    lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
+                )
+            except (ValueError, RuntimeError) as e:
+                print(f"Error fitting lognormal distribution: {e}")
+        return {
+            "pause_lognorm_mu": pause_lognorm_mu,
+            "pause_lognorm_sigma": pause_lognorm_sigma,
+            "pause_lognorm_ks_pvalue": pause_lognorm_ks_pvalue,
+            "pause_mean_duration": pause_mean_duration,
+            "pause_std_duration": pause_std_duration,
+            "pause_cv": pause_cv,
+        }
+    def _count_syllables_optimized(
+        self, sound, intensity, textgrid, threshold, mindip, originaldur
+    ):
+        """Optimized syllable counting avoiding redundant matrix operations."""
+        intensity_matrix = call(intensity, "Down to Matrix")
+        sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
+        intensity_duration = call(sound_from_intensity_matrix, "Get total duration")
+        point_process = call(
+            sound_from_intensity_matrix,
+            "To PointProcess (extrema)",
+            "Left",
+            "yes",
+            "no",
+            "Sinc70",
+        )
+        numpeaks = call(point_process, "Get number of points")
+        # Vectorized time extraction
+        timepeaks = []
+        intensities = []
+        for i in range(numpeaks):
+            t = call(point_process, "Get time from index", i + 1)
+            value = call(sound_from_intensity_matrix, "Get value at time", t, "Cubic")
+            if value > threshold:
+                timepeaks.append(t)
+                intensities.append(value)
+        # Optimized peak validation
+        validtime = []
+        if len(timepeaks) > 1:
+            for p in range(len(timepeaks) - 1):
+                currenttime = timepeaks[p]
+                currentint = intensities[p]
+                dip = call(
+                    intensity, "Get minimum", currenttime, timepeaks[p + 1], "None"
+                )
+                if abs(currentint - dip) > mindip:
+                    validtime.append(timepeaks[p])
+        # Count voiced syllables
+        pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
+        voicedcount = 0
+        for querytime in validtime:
+            whichinterval = call(textgrid, "Get interval at time", 1, querytime)
+            whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
+            pitch_value = pitch.get_value_at_time(querytime)
+            if not math.isnan(pitch_value) and whichlabel == "sounding":
+                voicedcount += 1
+        # Get silencetable for speaking time calculation
+        silencetier = call(textgrid, "Extract tier", 1)
+        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
+        npauses = call(silencetable, "Get number of rows")
+        # Calculate speaking time
+        speakingtot = 0
+        for i in range(npauses):
+            beginsound = call(silencetable, "Get value", i + 1, 1)
+            endsound = call(silencetable, "Get value", i + 1, 2)
+            speakingtot += endsound - beginsound
+        # Calculate rates
+        speakingrate = voicedcount / originaldur
+        articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
+        asd = speakingtot / voicedcount if voicedcount > 0 else 0
+        return {
+            "nsyll": voicedcount,
+            "npause": npauses - 1,
+            "phonationtime_s": intensity_duration,
+            "speechrate_nsyll_dur": speakingrate,
+            "articulation_rate_nsyll_phonationtime": articulationrate,
+            "ASD_speakingtime_nsyll": asd,
+        }
 # ## This function runs a 2-factor Principle Components Analysis (PCA) on Jitter and Shimmer
@@ -227,231 +405,126 @@ def run_pca(df):
 def compute_features(file_index):
-    # create lists to put the results
-    duration_list = []
-    mean_f0_list = []
-    sd_f0_list = []
-    hnr_list = []
-    local_jitter_list = []
-    localabsolute_jitter_list = []
-    rap_jitter_list = []
-    ppq5_jitter_list = []
-    ddp_jitter_list = []
-    local_shimmer_list = []
-    localdb_shimmer_list = []
-    apq3_shimmer_list = []
-    aqpq5_shimmer_list = []
-    apq11_shimmer_list = []
-    dda_shimmer_list = []
-    f1_mean_list = []
-    f2_mean_list = []
-    f3_mean_list = []
-    f4_mean_list = []
-    f1_median_list = []
-    f2_median_list = []
-    f3_median_list = []
-    f4_median_list = []
-    # Go through all the wave files in the folder and measure all the acoustics
-    #    for i, wave_file in enumerate(file_list):
+    """Optimized feature computation using AudioFeatureExtractor class.
+    FEATURE COUNT COMPARISON:
+    Original version: ~36 features
+    - Basic: duration, meanF0Hz, stdevF0Hz, HNR (4)
+    - Jitter: localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter (5)
+    - Shimmer: localShimmer, localdbShimmer, apq3Shimmer, apq5Shimmer, apq11Shimmer, ddaShimmer (6)
+    - Formants: f1-f4 mean/median (8)
+    - PCA: JitterPCA, ShimmerPCA (2)
+    - VTL: pF, fdisp, avgFormant, mff, fitch_vtl, delta_f, vtl_delta_f (7)
+    - Speech rate: nsyll, npause, phonationtime_s, speechrate_nsyll_dur,
+                   articulation_rate_nsyll_phonationtime, ASD_speakingtime_nsyll (6)
+    Current optimized version: ~42 features (+6 new pause distribution features)
+    - All original 36 features PLUS:
+    - Pause distribution: pause_lognorm_mu, pause_lognorm_sigma, pause_lognorm_ks_pvalue,
+                         pause_mean_duration, pause_std_duration, pause_cv (6)
+    - Additional: proportion_pause_duration (1)
+    Total: 43 features (7 new features added for AD detection)
+    """
+    extractor = AudioFeatureExtractor()
+    feature_list = []
     for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
-        signal, sampling_rate = audiofile.read(
-            wave_file,
-            offset=start.total_seconds(),
-            duration=(end - start).total_seconds(),
-            always_2d=True,
-        )
         try:
+            signal, sampling_rate = audiofile.read(
+                wave_file,
+                offset=start.total_seconds(),
+                duration=(end - start).total_seconds(),
+                always_2d=True,
+            )
             sound = parselmouth.Sound(values=signal, sampling_frequency=sampling_rate)
-            (
-                duration,
-                mean_f0,
-                stdev_f0,
-                hnr,
-                local_jitter,
-                localabsolute_jitter,
-                rap_jitter,
-                ppq5_jitter,
-                ddp_jitter,
-                local_shimmer,
-                localdb_shimmer,
-                apq3_shimmer,
-                aqpq5_shimmer,
-                apq11_shimmer,
-                dda_shimmer,
-            ) = measure_pitch(sound, 75, 300, "Hertz")
-            (
-                f1_mean,
-                f2_mean,
-                f3_mean,
-                f4_mean,
-                f1_median,
-                f2_median,
-                f3_median,
-                f4_median,
-            ) = measure_formants(sound, 75, 300)
-            #        file_list.append(wave_file) # make an ID list
-        except (statistics.StatisticsError, parselmouth.PraatError) as errors:
-            print(f"error on file {wave_file}: {errors}")
-        duration_list.append(duration)  # make duration list
-        mean_f0_list.append(mean_f0)  # make a mean F0 list
-        sd_f0_list.append(stdev_f0)  # make a sd F0 list
-        hnr_list.append(hnr)  # add HNR data
-        # add raw jitter and shimmer measures
-        local_jitter_list.append(local_jitter)
-        localabsolute_jitter_list.append(localabsolute_jitter)
-        rap_jitter_list.append(rap_jitter)
-        ppq5_jitter_list.append(ppq5_jitter)
-        ddp_jitter_list.append(ddp_jitter)
-        local_shimmer_list.append(local_shimmer)
-        localdb_shimmer_list.append(localdb_shimmer)
-        apq3_shimmer_list.append(apq3_shimmer)
-        aqpq5_shimmer_list.append(aqpq5_shimmer)
-        apq11_shimmer_list.append(apq11_shimmer)
-        dda_shimmer_list.append(dda_shimmer)
-        # add the formant data
-        f1_mean_list.append(f1_mean)
-        f2_mean_list.append(f2_mean)
-        f3_mean_list.append(f3_mean)
-        f4_mean_list.append(f4_mean)
-        f1_median_list.append(f1_median)
-        f2_median_list.append(f2_median)
-        f3_median_list.append(f3_median)
-        f4_median_list.append(f4_median)
-    # ## This block of code adds all of that data we just generated to a Pandas data frame
-    # Add the data to Pandas
-    df = pd.DataFrame(
-        np.column_stack(
-            [
-                duration_list,
-                mean_f0_list,
-                sd_f0_list,
-                hnr_list,
-                local_jitter_list,
-                localabsolute_jitter_list,
-                rap_jitter_list,
-                ppq5_jitter_list,
-                ddp_jitter_list,
-                local_shimmer_list,
-                localdb_shimmer_list,
-                apq3_shimmer_list,
-                aqpq5_shimmer_list,
-                apq11_shimmer_list,
-                dda_shimmer_list,
-                f1_mean_list,
-                f2_mean_list,
-                f3_mean_list,
-                f4_mean_list,
-                f1_median_list,
-                f2_median_list,
-                f3_median_list,
-                f4_median_list,
-            ]
-        ),
-        columns=[
-            "duration",
-            "meanF0Hz",
-            "stdevF0Hz",
-            "HNR",
-            "localJitter",
-            "localabsoluteJitter",
-            "rapJitter",
-            "ppq5Jitter",
-            "ddpJitter",
-            "localShimmer",
-            "localdbShimmer",
-            "apq3Shimmer",
-            "apq5Shimmer",
-            "apq11Shimmer",
-            "ddaShimmer",
-            "f1_mean",
-            "f2_mean",
-            "f3_mean",
-            "f4_mean",
-            "f1_median",
-            "f2_median",
-            "f3_median",
-            "f4_median",
-        ],
-    )
+            # Extract all features in one pass
+            features = extractor.extract_all_features(sound)
+            feature_list.append(features)
-    # add pca data
-    pca_data = run_pca(df)  # Run jitter and shimmer PCA
-    df = pd.concat([df, pca_data], axis=1)  # Add PCA data
-    # reload the data so it's all numbers
-    # df.to_csv("processed_results.csv", index=False)
-    # df = pd.read_csv("processed_results.csv", header=0)
-    #    df.sort_values('voiceID').head(20)
-    # ## Next we calculate the vocal-tract length estimates
-    # ### Formant position
-    #  Puts, D. A., Apicella, C. L., & Cárdenas, R. A. (2012). Masculine voices signal men's threat potential in forager and industrial societies. Proceedings of the Royal Society of London B: Biological Sciences, 279(1728), 601-609.
-    df["pF"] = (
-        zscore(df.f1_median)
-        + zscore(df.f2_median)
-        + zscore(df.f3_median)
-        + zscore(df.f4_median)
-    ) / 4
-    # ### Formant Dispersion
-    # Fitch, W. T. (1997). Vocal tract length and formant frequency dispersion correlate with body size in rhesus macaques. The Journal of the Acoustical Society of America, 102(2), 1213-1222.
-    df["fdisp"] = (df["f4_median"] - df["f1_median"]) / 3
-    # ### Fn (Average Formant)
-    # Pisanski, K., & Rendall, D. (2011). The prioritization of voice fundamental frequency or formants in listeners’ assessments of speaker size, masculinity, and attractiveness. The Journal of the Acoustical Society of America, 129(4), 2201-2212.
-    df["avgFormant"] = (
-        df["f1_median"] + df["f2_median"] + df["f3_median"] + df["f4_median"]
-    ) / 4
-    # ### MFF
-    # Smith, D. R., & Patterson, R. D. (2005). The interaction of glottal-pulse rate and vocal-tract length in judgements of speaker size, sex, and age. The Journal of the Acoustical Society of America, 118(5), 3177-3186.
-    df["mff"] = (
-        df["f1_median"] * df["f2_median"] * df["f3_median"] * df["f4_median"]
-    ) ** 0.25
-    # ### Fitch VTL
-    # Fitch, W. T. (1997). Vocal tract length and formant frequency dispersion correlate with body size in rhesus macaques. The Journal of the Acoustical Society of America, 102(2), 1213-1222.
-    # reload the data again
-    # df.to_csv("processed_results.csv", index=False)
-    # df = pd.read_csv('processed_results.csv', header=0)
-    df["fitch_vtl"] = (
-        (1 * (35000 / (4 * df["f1_median"])))
-        + (3 * (35000 / (4 * df["f2_median"])))
-        + (5 * (35000 / (4 * df["f3_median"])))
-        + (7 * (35000 / (4 * df["f4_median"])))
-    ) / 4
-    # ### $\Delta$F
-    # Reby,D.,& McComb,K.(2003). Anatomical constraints generate honesty: acoustic cues to age and weight in the roars of red deer stags. Animal Behaviour, 65, 519e-530.
-    xysum = (
-        (0.5 * df["f1_median"])
-        + (1.5 * df["f2_median"])
-        + (2.5 * df["f3_median"])
-        + (3.5 * df["f4_median"])
+        except Exception as errors:
+            print(f"error on file {wave_file}: {errors}")
+            # Add empty feature dict for failed files
+            feature_list.append(
+                {
+                    key: np.nan
+                    for key in ["duration", "meanF0Hz", "stdevF0Hz", "HNR"]
+                    + [
+                        f"f{i}_{stat}"
+                        for i in range(1, 5)
+                        for stat in ["mean", "median"]
+                    ]
+                    + [
+                        "localJitter",
+                        "localabsoluteJitter",
+                        "rapJitter",
+                        "ppq5Jitter",
+                        "ddpJitter",
+                        "localShimmer",
+                        "localdbShimmer",
+                        "apq3Shimmer",
+                        "apq5Shimmer",
+                        "apq11Shimmer",
+                        "ddaShimmer",
+                    ]
+                }
+            )
+    # Create DataFrame directly from feature list
+    df = pd.DataFrame(feature_list)
+    # Add derived features efficiently
+    df = add_derived_features(df)
+    print(
+        f"Feature extraction completed. Total features extracted: {len(df.columns) if 'df' in locals() else '~43'}"
     )
-    xsquaredsum = (0.5**2) + (1.5**2) + (2.5**2) + (3.5**2)
-    df["delta_f"] = xysum / xsquaredsum
-    # ### VTL($\Delta$F)
-    # Reby,D.,&McComb,K.(2003).Anatomical constraints generate honesty: acoustic cues to age and weight in the roars of red deer stags. Animal Behaviour, 65, 519e-530.
-    df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])
+    return df
-    print("Now extracting speech rate parameters...")
-    df_speechrate = get_speech_rate(file_index)
-    print("")
+def add_derived_features(df):
+    """Add PCA and vocal tract length features efficiently."""
+    # PCA on jitter/shimmer
+    pca_data = run_pca(df)
+    df = pd.concat([df, pca_data], axis=1)
+    # Vectorized vocal tract calculations
+    with np.errstate(divide="ignore", invalid="ignore"):
+        df["pF"] = (
+            zscore(df.f1_median)
+            + zscore(df.f2_median)
+            + zscore(df.f3_median)
+            + zscore(df.f4_median)
+        ) / 4
+        df["fdisp"] = (df["f4_median"] - df["f1_median"]) / 3
+        df["avgFormant"] = (
+            df["f1_median"] + df["f2_median"] + df["f3_median"] + df["f4_median"]
+        ) / 4
+        df["mff"] = (
+            df["f1_median"] * df["f2_median"] * df["f3_median"] * df["f4_median"]
+        ) ** 0.25
+        # Fitch VTL calculation
+        df["fitch_vtl"] = (
+            (1 * (35000 / (4 * df["f1_median"])))
+            + (3 * (35000 / (4 * df["f2_median"])))
+            + (5 * (35000 / (4 * df["f3_median"])))
+            + (7 * (35000 / (4 * df["f4_median"])))
+        ) / 4
+        # Delta F calculation
+        xysum = (
+            0.5 * df["f1_median"]
+            + 1.5 * df["f2_median"]
+            + 2.5 * df["f3_median"]
+            + 3.5 * df["f4_median"]
+        )
+        xsquaredsum = 0.5**2 + 1.5**2 + 2.5**2 + 3.5**2
+        df["delta_f"] = xysum / xsquaredsum
+        df["vtl_delta_f"] = 35000 / (2 * df["delta_f"])
-    return df.join(df_speechrate)
+    return df
 """
@@ -469,6 +542,12 @@ def get_speech_rate(file_index):
         "speechrate_nsyll_dur",
         "articulation_rate_nsyll_phonationtime",
         "ASD_speakingtime_nsyll",
+        "pause_lognorm_mu",
+        "pause_lognorm_sigma",
+        "pause_lognorm_ks_pvalue",
+        "pause_mean_duration",
+        "pause_std_duration",
+        "pause_cv",
     ]
     datalist = []
     for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
@@ -527,6 +606,8 @@ def speech_rate(sound):
     silencetable = call(silencetier, "Down to TableOfReal", "sounding")
     npauses = call(silencetable, "Get number of rows")
     speakingtot = 0
+    pause_durations = []  # Store individual pause durations
     for ipause in range(npauses):
         pause = ipause + 1
         beginsound = call(silencetable, "Get value", pause, 1)
@@ -534,6 +615,53 @@ def speech_rate(sound):
         speakingdur = endsound - beginsound
         speakingtot += speakingdur
+        # Calculate pause duration (time between speaking segments)
+        if ipause > 0:
+            prev_pause = ipause
+            prev_endsound = call(silencetable, "Get value", prev_pause, 2)
+            pause_duration = beginsound - prev_endsound
+            if pause_duration > 0:  # Only include positive pause durations
+                pause_durations.append(pause_duration)
+    # Calculate pause duration distribution parameters
+    pause_lognorm_mu = np.nan
+    pause_lognorm_sigma = np.nan
+    pause_lognorm_ks_pvalue = np.nan
+    pause_mean_duration = np.nan
+    pause_std_duration = np.nan
+    pause_cv = np.nan
+    if len(pause_durations) >= 3:  # Need minimum samples for distribution fitting
+        try:
+            # Fit lognormal distribution to pause durations
+            pause_durations_array = np.array(pause_durations)
+            # Calculate basic statistics
+            pause_mean_duration = np.mean(pause_durations_array)
+            pause_std_duration = np.std(pause_durations_array)
+            pause_cv = (
+                pause_std_duration / pause_mean_duration
+                if pause_mean_duration > 0
+                else 0
+            )
+            # Fit lognormal distribution
+            shape, loc, scale = lognorm.fit(pause_durations_array, floc=0)
+            pause_lognorm_sigma = shape  # shape parameter (sigma)
+            pause_lognorm_mu = np.log(scale)  # location parameter (mu)
+            # Test goodness of fit using Kolmogorov-Smirnov test
+            ks_stat, pause_lognorm_ks_pvalue = stats.kstest(
+                pause_durations_array,
+                lambda x: lognorm.cdf(x, shape, loc=loc, scale=scale),
+            )
+        except (ValueError, RuntimeError) as e:
+            print(f"Error fitting lognormal distribution to pause durations: {e}")
+    # Calculate pause duration
+    pausetot = originaldur - speakingtot
     intensity_matrix = call(intensity, "Down to Matrix")
     # sndintid = sound_from_intensity_matrix
     sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
@@ -567,8 +695,8 @@ def speech_rate(sound):
     # fill array with valid peaks: only intensity values if preceding
     # dip in intensity is greater than mindip
     validpeakcount = 0
-    currenttime = timepeaks[0]
-    currentint = intensities[0]
+    currenttime = timepeaks[0] if timepeaks else 0
+    currentint = intensities[0] if intensities else 0
     validtime = []
     for p in range(peakcount - 1):
@@ -609,20 +737,35 @@ def speech_rate(sound):
     # return results
     speakingrate = voicedcount / originaldur
-    articulationrate = voicedcount / speakingtot
+    articulationrate = voicedcount / speakingtot if speakingtot > 0 else 0
     npause = npauses - 1
     try:
         asd = speakingtot / voicedcount
     except ZeroDivisionError:
         asd = 0
         print("caught zero division")
+    # Calculate proportion pause duration
+    try:
+        proportion_pause_duration = pausetot / speakingtot
+    except ZeroDivisionError:
+        proportion_pause_duration = 0
+        print("caught zero division for proportion pause duration")
     speechrate_dictionary = {
         "nsyll": voicedcount,
         "npause": npause,
-        "dur_s": originaldur,
+        # "dur_s": originaldur,
         "phonationtime_s": intensity_duration,
         "speechrate_nsyll_dur": speakingrate,
         "articulation_rate_nsyll_phonationtime": articulationrate,
         "ASD_speakingtime_nsyll": asd,
+        "proportion_pause_duration": proportion_pause_duration,
+        "pause_lognorm_mu": pause_lognorm_mu,
+        "pause_lognorm_sigma": pause_lognorm_sigma,
+        "pause_lognorm_ks_pvalue": pause_lognorm_ks_pvalue,
+        "pause_mean_duration": pause_mean_duration,
+        "pause_std_duration": pause_std_duration,
+        "pause_cv": pause_cv,
     }
     return speechrate_dictionary

nkululeko 0.94.2__py3-none-any.whl → 0.95.0__py3-none-any.whl

nkululeko 0.94.2py3-none-any.whl → 0.95.0py3-none-any.whl