PyPI - BatchalignHK - Versions diffs - 0.7.21.post7__tar.gz → 0.7.22.post1__tar.gz - Mend

BatchalignHK 0.7.21.post7tar.gz → 0.7.22.post1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (167) hide show

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/BatchalignHK.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: BatchalignHK
-Version: 0.7.21.post7
+Version: 0.7.22.post1
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -18,6 +18,8 @@ Requires-Dist: pydub
 Requires-Dist: plotly>=5.3.0
 Requires-Dist: transformers>=4.38.2
 Requires-Dist: tokenizers>=0.14.1
+Requires-Dist: numba>=0.61.0
+Requires-Dist: numpy<=2.2
 Requires-Dist: pycountry>=22.3
 Requires-Dist: stanza[transformers]>=1.10.1
 Requires-Dist: scipy~=1.11
@@ -42,6 +44,12 @@ Requires-Dist: aliyun-python-sdk-core>=2.13.3
 Requires-Dist: oss2
 Requires-Dist: openai-whisper>=20240930
 Requires-Dist: funasr
+Requires-Dist: cos-python-sdk-v5
+Requires-Dist: openai-whisper
+Requires-Dist: llvmlite>=0.44.0
+Requires-Dist: praat-parselmouth==0.4.6
+Requires-Dist: pyannote.audio
+Requires-Dist: onnxruntime
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Provides-Extra: train
@@ -83,8 +91,15 @@ UV_PYTHON=3.11 uv tool install batchalign
 #### Windows
+There are two commands used to install Batchalign. Run both in `powershell`:
 ```
 powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+Restart `powershell` and run the second command:
+```
 uv tool install batchalign
 ```

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/BatchalignHK.egg-info/SOURCES.txt RENAMED Viewed

@@ -100,6 +100,8 @@ batchalign/pipelines/asr/tencent.py
 batchalign/pipelines/asr/utils.py
 batchalign/pipelines/asr/whisper.py
 batchalign/pipelines/asr/whisperx.py
+batchalign/pipelines/avqi/__init__.py
+batchalign/pipelines/avqi/engine.py
 batchalign/pipelines/cleanup/__init__.py
 batchalign/pipelines/cleanup/cleanup.py
 batchalign/pipelines/cleanup/disfluencies.py
@@ -108,6 +110,8 @@ batchalign/pipelines/cleanup/retrace.py
 batchalign/pipelines/cleanup/support/filled_pauses.eng
 batchalign/pipelines/cleanup/support/replacements.eng
 batchalign/pipelines/cleanup/support/test.test
+batchalign/pipelines/diarization/__init__.py
+batchalign/pipelines/diarization/pyannote.py
 batchalign/pipelines/fa/__init__.py
 batchalign/pipelines/fa/wave2vec_fa.py
 batchalign/pipelines/fa/whisper_fa.py

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/BatchalignHK.egg-info/requires.txt RENAMED Viewed

@@ -8,6 +8,8 @@ pydub
 plotly>=5.3.0
 transformers>=4.38.2
 tokenizers>=0.14.1
+numba>=0.61.0
+numpy<=2.2
 pycountry>=22.3
 stanza[transformers]>=1.10.1
 scipy~=1.11
@@ -32,6 +34,12 @@ aliyun-python-sdk-core>=2.13.3
 oss2
 openai-whisper>=20240930
 funasr
+cos-python-sdk-v5
+openai-whisper
+llvmlite>=0.44.0
+praat-parselmouth==0.4.6
+pyannote.audio
+onnxruntime
 [dev]
 pytest

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: BatchalignHK
-Version: 0.7.21.post7
+Version: 0.7.22.post1
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -18,6 +18,8 @@ Requires-Dist: pydub
 Requires-Dist: plotly>=5.3.0
 Requires-Dist: transformers>=4.38.2
 Requires-Dist: tokenizers>=0.14.1
+Requires-Dist: numba>=0.61.0
+Requires-Dist: numpy<=2.2
 Requires-Dist: pycountry>=22.3
 Requires-Dist: stanza[transformers]>=1.10.1
 Requires-Dist: scipy~=1.11
@@ -42,6 +44,12 @@ Requires-Dist: aliyun-python-sdk-core>=2.13.3
 Requires-Dist: oss2
 Requires-Dist: openai-whisper>=20240930
 Requires-Dist: funasr
+Requires-Dist: cos-python-sdk-v5
+Requires-Dist: openai-whisper
+Requires-Dist: llvmlite>=0.44.0
+Requires-Dist: praat-parselmouth==0.4.6
+Requires-Dist: pyannote.audio
+Requires-Dist: onnxruntime
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Provides-Extra: train
@@ -83,8 +91,15 @@ UV_PYTHON=3.11 uv tool install batchalign
 #### Windows
+There are two commands used to install Batchalign. Run both in `powershell`:
 ```
 powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+Restart `powershell` and run the second command:
+```
 uv tool install batchalign
 ```

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/README.md RENAMED Viewed

@@ -22,8 +22,15 @@ UV_PYTHON=3.11 uv tool install batchalign
 #### Windows
+There are two commands used to install Batchalign. Run both in `powershell`:
 ```
 powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+Restart `powershell` and run the second command:
+```
 uv tool install batchalign
 ```

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/batchalign/cli/cli.py RENAMED Viewed

@@ -385,7 +385,53 @@ def benchmark(ctx, in_dir, out_dir, lang, num_speakers, whisper, tencent, **kwar
     _dispatch("benchmark", lang, num_speakers, ["mp3", "mp4", "wav"], ctx,
               in_dir, out_dir,
               loader, writer, C,
-              asr="whisper" if whisper else ("funaudio" if funaudio else ("tencent" if tencent else "rev")), **kwargs)
+              asr="whisper" if whisper else ("funaudio" if funaudio else ("tencent" if tencent else "rev")),
+              **kwargs)
+#################### AVQI ################################
+@batchalign.command()
+@click.argument("cs_file", type=click.Path(exists=True, file_okay=True, dir_okay=False))
+@click.argument("sv_file", type=click.Path(exists=True, file_okay=True, dir_okay=False))
+@click.option("--lang",
+              help="sample language in three-letter ISO 3166-1 alpha-3 code",
+              show_default=True,
+              default="eng",
+              type=str)
+@click.pass_context
+def avqi(ctx, cs_file, sv_file, lang, **kwargs):
+    """Calculate Acoustic Voice Quality Index (AVQI) from continuous speech and sustained vowel audio files."""
+    # NOTE: the docstring above is the command's help text, so it is kept verbatim.
+    # Both arguments must be existing *files*: dir_okay=False makes click reject a
+    # directory up front instead of letting it fail later inside the engine.
+    # The engine is imported inside the command rather than at module import time.
+    from batchalign.pipelines.avqi import AVQIEngine
+    # The report is written next to the continuous-speech file; with_suffix swaps
+    # the final extension, e.g. "sample.wav" -> "sample.avqi.txt".
+    cs_path = Path(cs_file)
+    output_file = cs_path.with_suffix('.avqi.txt')
+    avqi_engine = AVQIEngine()
+    try:
+        # Echo the inputs so the user can confirm what is being analysed.
+        C.print("\n[blue]Calculating AVQI[/blue] for:")
+        C.print(f"  Continuous Speech: [cyan]{cs_file}[/cyan]")
+        C.print(f"  Sustained Vowel:   [cyan]{sv_file}[/cyan]")
+        C.print(f"  Language:          [cyan]{lang}[/cyan]")
+        C.print(f"  Output:            [cyan]{output_file}[/cyan]\n")
+        results = avqi_engine.analyze(cs_file, sv_file, str(output_file), lang)
+        C.print("[bold green]✓ AVQI calculation completed![/bold green]")
+        C.print(f"[bold]AVQI Score: {results['avqi']:.3f}[/bold]")
+        C.print(f"Results saved to: [cyan]{output_file}[/cyan]\n")
+    except Exception as e:
+        # Best-effort reporting: show the message, and the traceback only when
+        # the user asked for verbose output.
+        C.print(f"[bold red]ERROR[/bold red]: {str(e)}")
+        if ctx.obj["verbose"] > 0:
+            import traceback
+            C.print(traceback.format_exc())
 #################### SETUP ################################
@@ -409,3 +455,4 @@ def version(ctx, **kwargs):
            f"[italic]{RELEASE_NOTES.strip()}[/italic]"+"\n" +
            "\nDeveloped by Brian MacWhinney and Houjun Liu")
     C.print("\n\n"+ptr+"\n\n")

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/batchalign/cli/dispatch.py RENAMED Viewed

@@ -191,9 +191,11 @@ def _dispatch(command, lang, num_speakers,
                                    extra_info={"extra_input": extr_data_mapping.get(file)},
                                    **kw)
                 msgs = [escape(str(i.message)).strip() for i in w]
+                msgs = [i for i in msgs if "torchaudio" not in i.lower()]
                 # write the format, as needed
                 writer(doc, output)
                 # print any warnings
                 if len(msgs) > 0:
                     if ctx.obj["verbose"] > 1:
                         Console().print(f"\n[bold yellow]WARN[/bold yellow] on {file}:\n","\n".join(msgs)+"\n")

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/batchalign/pipelines/__init__.py RENAMED Viewed

@@ -13,4 +13,7 @@ from .utr import WhisperUTREngine, RevUTREngine, TencentUTREngine, FunAudioUTREn
 from .analysis import EvaluationEngine
 from .utterance import StanzaUtteranceEngine
-# from .translate import SeamlessTranslationModel, GoogleTranslateEngine
+from .translate import SeamlessTranslationModel, GoogleTranslateEngine
+from .avqi import AVQIEngine
+from .diarization import PyannoteEngine

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/batchalign/pipelines/asr/rev.py RENAMED Viewed

@@ -28,9 +28,9 @@ class RevEngine(BatchalignEngine):
         # if there is no utterance segmentation scheme, we only
         # run ASR
         if self.__engine:
-            return [ Task.ASR, Task.SPEAKER_RECOGNITION, Task.UTTERANCE_SEGMENTATION ]
+            return [ Task.ASR, Task.UTTERANCE_SEGMENTATION ]
         else:
-            return [ Task.ASR, Task.SPEAKER_RECOGNITION ]
+            return [ Task.ASR ]
     def __init__(self, key:str=None, lang="eng", num_speakers=2):

{batchalignhk-0.7.21.post7 → batchalignhk-0.7.22.post1}/batchalign/pipelines/asr/tencent.py RENAMED Viewed

@@ -26,10 +26,13 @@ import soundfile as sf
 import base64
 from tencentcloud.common.credential import Credential
 from tencentcloud.asr.v20190614.asr_client import AsrClient, models
+from qcloud_cos import CosConfig
+from qcloud_cos import CosS3Client
 import asyncio
 import tempfile
 import os
+import uuid
 # from pydub import AudioSegment
 # from pydub.effects import normalize
 # from pydub.exceptions import CouldntDecodeError
@@ -54,9 +57,21 @@ class TencentEngine(BatchalignEngine):
             try:
                 id = config["asr"]["engine.tencent.id"]
                 key = config["asr"]["engine.tencent.key"]
+                region = config["asr"]["engine.tencent.region"]
+                bucket_name = config["asr"]["engine.tencent.bucket"]
             except KeyError:
                 raise ConfigError("No Tencent Cloud key found. Tencent Cloud was not set up! Please write one yourself and place it at ~/.batchalign.ini.")
+        config = CosConfig(
+            Region=region,
+            SecretId=id,
+            SecretKey=key,
+            Token=None,
+            Scheme="https"
+        )
+        self.__bucket = CosS3Client(config)
+        self.__bucket_name = bucket_name
         self.__lang_code = lang
         self.__num_speakers = num_speakers
@@ -120,15 +135,22 @@ class TencentEngine(BatchalignEngine):
     def generate(self, f, **kwargs):
         lang = self.__lang
         client = self.__client
-        # processed_path = self.__preprocess_audio(f)
-        # audio = AudioSegment.from_file(processed_path)
+        bucket = self.__bucket
+        bucket_name = self.__bucket_name
+        uid = str(uuid.uuid4())
+        # read and upload the cos path
+        # f = "/Users/houjun/Documents/Projects/talkbank-alignment/input/SD05.mp3"
+        L.info(f"Tencent is uploading '{pathlib.Path(f).stem}'...")
+        response = bucket.upload_file(
+            Bucket=bucket_name,
+            LocalFilePath=f,
+            Key=uid+pathlib.Path(f).suffix,
+            PartSize=1,
+            MAXThread=10,
+            EnableMD5=False
+        )
-        L.info(f"Uploading '{pathlib.Path(f).stem}'...")
-        # we will send the file for processing
-        if not str(f).startswith("http"):
-            with open(f, "rb") as image_file:
-                encoded_string = base64.b64encode(image_file.read())
         req = models.CreateRecTaskRequest()
         if lang in {'zho', 'yue', 'wuu', 'nan','hak'}:
@@ -138,12 +160,8 @@ class TencentEngine(BatchalignEngine):
         req.ResTextFormat = 1
         req.SpeakerDiarization = 1
         req.ChannelNum = 1
-        if not str(f).startswith("http"):
-            req.Data = encoded_string.decode('ascii')
-            req.SourceType = 1
-        else:
-            req.Url = f
-            req.SourceType = 0
+        req.Url = response["Location"]
+        req.SourceType = 0
         resp = client.CreateRecTask(req)
         L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
@@ -158,6 +176,12 @@ class TencentEngine(BatchalignEngine):
         if res.Data.Status in ["3", 3]:
             raise RuntimeError(f"Tencent reports job failed! error='{res.Data.ErrorMsg}'")
+        # delete the file
+        response = bucket.delete_object(
+            Bucket=bucket_name,
+            Key=response["Key"]
+        )
         turns = []
         for i in res.Data.ResultDetail:
             turn = []

batchalignhk-0.7.22.post1/batchalign/pipelines/avqi/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""
+AVQI Pipeline Module
+Acoustic Voice Quality Index calculation
+"""
+from .engine import AVQIEngine
+__all__ = ['AVQIEngine']

batchalignhk-0.7.22.post1/batchalign/pipelines/avqi/engine.py ADDED Viewed

@@ -0,0 +1,264 @@
+"""
+AVQI Engine for Batchalign2
+Acoustic Voice Quality Index calculation for voice quality assessment
+"""
+import parselmouth
+import numpy as np
+from parselmouth.praat import call
+import re
+from typing import Tuple, Dict, Optional
+import os
+from pathlib import Path
+import logging
+from batchalign.pipelines.base import BatchalignEngine
+from batchalign.document import Task
+L = logging.getLogger('batchalign')
+class AVQIEngine(BatchalignEngine):
+    """Engine for calculating Acoustic Voice Quality Index (AVQI).
+    The score is derived from six Praat measurements (CPPS, HNR, shimmer
+    local, shimmer local dB, LTAS slope, LTAS tilt) taken on the voiced part
+    of a continuous-speech recording concatenated with at most the last three
+    seconds of a sustained-vowel recording. All Praat work goes through
+    ``parselmouth.praat.call``.
+    """
+    def __init__(self):
+        super().__init__()
+        self._tasks = [Task.FEATURE_EXTRACT]
+    @property
+    def tasks(self):
+        """Tasks declared by this engine (feature extraction only)."""
+        return self._tasks
+    def extract_voiced_segments(self, sound):
+        """Extract voiced segments from audio.
+        Parameters
+        ----------
+        sound : parselmouth.Sound
+            Recording to reduce to its voiced portions.
+        Returns
+        -------
+        parselmouth.Sound
+            A near-empty seed sound (0.001 s of zeros) followed by every
+            30 ms window that was judged voiced, concatenated in order.
+        """
+        original = call(sound, "Copy", "original")
+        sampling_rate = call(original, "Get sampling frequency")
+        # Seed object that the kept windows are appended to.
+        onlyVoice = call("Create Sound", "onlyVoice", 0, 0.001, sampling_rate, "0")
+        # Label the recording as "silence" / "sounding" intervals ...
+        textgrid = call(
+            original,
+            "To TextGrid (silences)",
+            50,
+            0.003,
+            -25,
+            0.1,
+            0.1,
+            "silence",
+            "sounding",
+        )
+        # ... and keep only the tier-1 intervals whose label is not "silence".
+        intervals = call(
+            [original, textgrid],
+            "Extract intervals where",
+            1,
+            False,
+            "does not contain",
+            "silence",
+        )
+        onlyLoud = call(intervals, "Concatenate")
+        # A window counts as voiced when its power exceeds 30% of the overall power.
+        globalPower = call(onlyLoud, "Get power in air")
+        voicelessThreshold = globalPower * 0.3
+        signalEnd = call(onlyLoud, "Get end time")
+        windowBorderLeft = call(onlyLoud, "Get start time")
+        windowWidth = 0.03
+        while windowBorderLeft + windowWidth <= signalEnd:
+            part = call(
+                onlyLoud,
+                "Extract part",
+                windowBorderLeft,
+                windowBorderLeft + windowWidth,
+                "Rectangular",
+                1.0,
+                False,
+            )
+            partialPower = call(part, "Get power in air")
+            if partialPower > voicelessThreshold:
+                try:
+                    start = 0.0025
+                    startZero = call(part, "Get nearest zero crossing", start)
+                    # NOTE(review): an undefined Praat result may arrive as NaN,
+                    # which this check lets through - confirm that is intended.
+                    if startZero is not None and not np.isinf(startZero):
+                        onlyVoice = call([onlyVoice, part], "Concatenate")
+                except Exception:
+                    # Best effort: a window Praat cannot process is skipped.
+                    # (Narrowed from a bare except so Ctrl-C still works.)
+                    pass
+            windowBorderLeft += windowWidth
+        return onlyVoice
+    def _trend_line_slope(self, ltas, fmin, fmax):
+        """Return the LTAS slope measured on the trend line fitted between fmin and fmax."""
+        source = call(ltas, "Copy", "ltas_for_tilt")
+        # "Compute trend line" yields a NEW Ltas; the slope must be read from
+        # that object, not from the one the command was applied to.
+        trend = call(source, "Compute trend line", fmin, fmax)
+        if trend is None:
+            # Defensive: if nothing is returned, fall back to the copy.
+            trend = source
+        return call(trend, "Get slope", 0, 1000, 1000, 10000, "energy")
+    def calculate_avqi_features(self, cs_file, sv_file):
+        """Calculate AVQI score and features from continuous speech and sustained vowel files.
+        Parameters
+        ----------
+        cs_file : str
+            Path to the continuous speech audio file.
+        sv_file : str
+            Path to the sustained vowel audio file.
+        Returns
+        -------
+        tuple
+            ``(avqi, features)`` where ``features`` maps ``cpps``, ``hnr``,
+            ``shimmer_local``, ``shimmer_local_db``, ``slope`` and ``tilt``
+            to their measured values.
+        """
+        cs_sound = parselmouth.Sound(cs_file)
+        sv_sound = parselmouth.Sound(sv_file)
+        # Stop-band 0-34 Hz on both recordings before any measurement.
+        cs_filtered = call(cs_sound, "Filter (stop Hann band)", 0, 34, 0.1)
+        sv_filtered = call(sv_sound, "Filter (stop Hann band)", 0, 34, 0.1)
+        voiced_cs = self.extract_voiced_segments(cs_filtered)
+        # Use only the final 3 s of the vowel; shorter vowels are used whole.
+        sv_duration = call(sv_filtered, "Get total duration")
+        if sv_duration > 3:
+            sv_start = sv_duration - 3
+            sv_part = call(
+                sv_filtered, "Extract part", sv_start, sv_duration, "rectangular", 1, False
+            )
+        else:
+            sv_part = call(sv_filtered, "Copy", "sv_part")
+        concatenated = call([voiced_cs, sv_part], "Concatenate")
+        # Smoothed cepstral peak prominence.
+        powercepstrogram = call(concatenated, "To PowerCepstrogram", 60, 0.002, 5000, 50)
+        cpps = call(
+            powercepstrogram,
+            "Get CPPS",
+            False,
+            0.01,
+            0.001,
+            60,
+            330,
+            0.05,
+            "Parabolic",
+            0.001,
+            0,
+            "Straight",
+            "Robust",
+        )
+        # Long-term average spectrum: slope, then tilt of its trend line.
+        ltas = call(concatenated, "To Ltas", 1)
+        slope = call(ltas, "Get slope", 0, 1000, 1000, 10000, "energy")
+        try:
+            tilt = self._trend_line_slope(ltas, 1, 10000)
+            # Fallbacks retained from the original implementation: retry on a
+            # narrower band, then use a fixed offset, if tilt is
+            # indistinguishable from slope.
+            if abs(tilt - slope) < 0.01:
+                tilt = self._trend_line_slope(ltas, 100, 8000)
+            if abs(tilt - slope) < 0.01:
+                tilt = slope + 5.5
+        except Exception:
+            tilt = slope + 5.5
+        # Shimmer: "local" is scaled by 100 before entering the formula.
+        pointprocess = call(concatenated, "To PointProcess (periodic, cc)", 50, 400)
+        shim_percent = call(
+            [concatenated, pointprocess],
+            "Get shimmer (local)",
+            0,
+            0,
+            0.0001,
+            0.02,
+            1.3,
+            1.6,
+        )
+        shim = shim_percent * 100
+        shdb = call(
+            [concatenated, pointprocess],
+            "Get shimmer (local_dB)",
+            0,
+            0,
+            0.0001,
+            0.02,
+            1.3,
+            1.6,
+        )
+        # HNR is not queried directly; it is parsed out of the text voice report.
+        pitch = call(
+            concatenated,
+            "To Pitch (cc)",
+            0,
+            75,
+            15,
+            False,
+            0.03,
+            0.45,
+            0.01,
+            0.35,
+            0.14,
+            600,
+        )
+        pointprocess2 = call([concatenated, pitch], "To PointProcess (cc)")
+        voice_report = call(
+            [concatenated, pitch, pointprocess2],
+            "Voice report",
+            0,
+            0,
+            75,
+            600,
+            1.3,
+            1.6,
+            0.03,
+            0.45,
+        )
+        hnr_match = re.search(
+            r"Mean harmonics-to-noise ratio:\s*([-+]?\d*\.?\d+)", voice_report
+        )
+        # A report without a numeric HNR falls back to 0.0.
+        hnr = float(hnr_match.group(1)) if hnr_match else 0.0
+        # Weighted combination of the six measures, rescaled by 2.8902.
+        avqi = (
+            4.152
+            - (0.177 * cpps)
+            - (0.006 * hnr)
+            - (0.037 * shim)
+            + (0.941 * shdb)
+            + (0.01 * slope)
+            + (0.093 * tilt)
+        ) * 2.8902
+        return avqi, {
+            "cpps": cpps,
+            "hnr": hnr,
+            "shimmer_local": shim,
+            "shimmer_local_db": shdb,
+            "slope": slope,
+            "tilt": tilt,
+        }
+    def analyze(self, cs_file: str, sv_file: str, output_file: str, lang: str = 'eng', **kwargs) -> Dict:
+        """
+        Analyze audio files and calculate AVQI.
+        Parameters
+        ----------
+        cs_file : str
+            Path to continuous speech audio file
+        sv_file : str
+            Path to sustained vowel audio file
+        output_file : str
+            Path to output file
+        lang : str
+            Language code (default: 'eng'); only recorded in the report
+        Returns
+        -------
+        Dict
+            Dictionary containing AVQI score and features. If anything fails
+            (measurement or writing the report) the error is logged and every
+            value is 0.0.
+        """
+        L.info(f"Calculating AVQI for CS: {cs_file}, SV: {sv_file}")
+        try:
+            avqi_score, features = self.calculate_avqi_features(cs_file, sv_file)
+            # Flat result: the score plus the six features it was built from.
+            results = {
+                'avqi': avqi_score,
+                'cpps': features['cpps'],
+                'hnr': features['hnr'],
+                'shimmer_local': features['shimmer_local'],
+                'shimmer_local_db': features['shimmer_local_db'],
+                'slope': features['slope'],
+                'tilt': features['tilt']
+            }
+            # Plain-text report, one "Name: value" line per measure.
+            with open(output_file, 'w', encoding='utf-8') as f:
+                f.write(f"AVQI: {avqi_score:.3f}\n")
+                f.write(f"CPPS: {features['cpps']:.3f}\n")
+                f.write(f"HNR: {features['hnr']:.3f}\n")
+                f.write(f"Shimmer Local: {features['shimmer_local']:.3f}\n")
+                f.write(f"Shimmer Local dB: {features['shimmer_local_db']:.3f}\n")
+                f.write(f"LTAS Slope: {features['slope']:.3f}\n")
+                f.write(f"LTAS Tilt: {features['tilt']:.3f}\n")
+                f.write(f"Language: {lang}\n")
+            L.info(f"AVQI results written to: {output_file}")
+            return results
+        except Exception as e:
+            # Log with traceback so the failure can be diagnosed.
+            L.exception(f"Error calculating AVQI: {e}")
+            # NOTE(review): the all-zero fallback is kept for backward
+            # compatibility, but callers cannot tell it apart from a genuine
+            # score of 0.0 - consider raising or flagging the failure.
+            return {
+                'avqi': 0.0,
+                'cpps': 0.0,
+                'hnr': 0.0,
+                'shimmer_local': 0.0,
+                'shimmer_local_db': 0.0,
+                'slope': 0.0,
+                'tilt': 0.0
+            }

batchalignhk-0.7.22.post1/batchalign/pipelines/diarization/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .pyannote import PyannoteEngine

BatchalignHK 0.7.21.post7__tar.gz → 0.7.22.post1__tar.gz

BatchalignHK 0.7.21.post7tar.gz → 0.7.22.post1tar.gz