PyPI - sonusai - Versions diffs - 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl - Mend

sonusai 0.19.9-py3-none-any.whl → 0.20.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

sonusai/calc_metric_spenh.py +265 -233
sonusai/data/genmixdb.yml +4 -2
sonusai/data/silero_vad_v5.1.jit +0 -0
sonusai/data/silero_vad_v5.1.onnx +0 -0
sonusai/doc/doc.py +14 -0
sonusai/genft.py +1 -1
sonusai/genmetrics.py +15 -18
sonusai/genmix.py +1 -1
sonusai/genmixdb.py +30 -52
sonusai/ir_metric.py +555 -0
sonusai/metrics_summary.py +322 -0
sonusai/mixture/__init__.py +6 -2
sonusai/mixture/audio.py +139 -15
sonusai/mixture/augmentation.py +199 -84
sonusai/mixture/config.py +9 -4
sonusai/mixture/constants.py +0 -1
sonusai/mixture/datatypes.py +19 -10
sonusai/mixture/generation.py +52 -64
sonusai/mixture/helpers.py +38 -26
sonusai/mixture/ir_delay.py +63 -0
sonusai/mixture/mixdb.py +190 -46
sonusai/mixture/targets.py +3 -6
sonusai/mixture/truth_functions/energy.py +9 -5
sonusai/mixture/truth_functions/metrics.py +1 -1
sonusai/mkwav.py +1 -1
sonusai/onnx_predict.py +1 -1
sonusai/queries/queries.py +1 -1
sonusai/utils/__init__.py +2 -0
sonusai/utils/asr.py +1 -1
sonusai/utils/load_object.py +8 -2
sonusai/utils/stratified_shuffle_split.py +1 -1
sonusai/utils/temp_seed.py +13 -0
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/METADATA +2 -2
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/RECORD +36 -35
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/WHEEL +1 -1
sonusai/mixture/soundfile_audio.py +0 -130
sonusai/mixture/sox_audio.py +0 -476
sonusai/mixture/sox_augmentation.py +0 -136
sonusai/mixture/torchaudio_audio.py +0 -106
sonusai/mixture/torchaudio_augmentation.py +0 -109
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/entry_points.txt +0 -0

sonusai/calc_metric_spenh.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """sonusai calc_metric_spenh
-usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] [-n NCPU] PLOC TLOC
+usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-n NCPU] PLOC TLOC
 options:
     -h, --help
@@ -11,8 +11,10 @@ options:
     -w, --wav                   Generate WAV files per mixture.
     -s, --summary               Enable summary files generation.
     -n, --num_process NCPU      Number of parallel processes to use [default: auto]
-    -e ASR, --asr-method ASR    ASR method: deepgram, google, aixplain_whisper, whisper, or sensory. [default: none]
-    -m MODEL, --model           ASR model name used in some ASR methods. [default: tiny]
+    -e ASR, --asr-method ASR    ASR method used for WER metrics.  Must exist in the TLOC dataset as pre-calculated
+                                metrics using SonusAI genmetrics. Can be either an integer index, i.e., 0, 1, ..., or the
+                                name of the asr_engine configuration in the dataset.  If an incorrect name is specified,
+                                a list of asr_engines of the dataset will be printed.
 Calculate speech enhancement metrics of prediction data in PLOC using SonusAI mixture data in TLOC as truth/label
 reference. Metric and extraction data files are written into PLOC.
@@ -20,9 +22,14 @@ reference. Metric and extraction data files are written into PLOC.
 PLOC  directory containing prediction data in .h5 files created from truth/label mixture data in TLOC
 TLOC  directory with SonusAI mixture database of truth/label mixture data
-For whisper ASR methods, the possible models used in local processing (ASR = whisper) are:
-    {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}
-but note most are very computationally demanding and can overwhelm/hang a local system.
+For ASR methods, the method must be defined in the TLOC dataset; for example, the available faster_whisper models are:
+{tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large} and an example configuration looks like:
+{'fwhsptiny_cpu': {'engine': 'faster_whisper',
+  'model': 'tiny',
+  'device': 'cpu',
+  'beam_size': 5}}
+Note: the ASR config can optionally include the model, device, and other fields the engine supports.
+Most ASR engines are very computationally demanding and can overwhelm/hang a local system.
 Outputs the following to PLOC (where id is mixid number 0:num_mixtures):
     <id>_metric_spenh.txt
@@ -61,8 +68,6 @@ Inputs:
 """
 import signal
-from contextlib import redirect_stdout
-from dataclasses import dataclass
 import matplotlib
 import matplotlib.pyplot as plt
@@ -93,24 +98,17 @@ signal.signal(signal.SIGINT, signal_handler)
 matplotlib.use("SVG")
-@dataclass
-class MPGlobal:
-    mixdb: MixtureDatabase
-    predict_location: str
-    predict_wav_mode: bool
-    truth_est_mode: bool
-    enable_plot: bool
-    enable_wav: bool
-    asr_method: str
-    asr_model_name: str
-MP_GLOBAL: MPGlobal
+def first_key(x: dict) -> str:
+    for key in x:
+        return key
+    raise KeyError("No key found")
 def mean_square_error(
-    hypothesis: np.ndarray, reference: np.ndarray, squared: bool = False
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    hypothesis: np.ndarray,
+    reference: np.ndarray,
+    squared: bool = False,
+) -> tuple[float, np.ndarray, np.ndarray]:
     """Calculate root-mean-square error or mean square error
     :param hypothesis: [frames, bins]
@@ -125,7 +123,7 @@ def mean_square_error(
     # mean over bins for value per frame
     err_f = np.mean(sq_err, axis=1)
     # mean over all
-    err = np.mean(sq_err)
+    err = float(np.mean(sq_err))
     if not squared:
         err_b = np.sqrt(err_b)
@@ -135,9 +133,7 @@ def mean_square_error(
     return err, err_b, err_f
-def mean_abs_percentage_error(
-    hypothesis: np.ndarray, reference: np.ndarray
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+def mean_abs_percentage_error(hypothesis: np.ndarray, reference: np.ndarray) -> tuple[float, np.ndarray, np.ndarray]:
     """Calculate mean abs percentage error
     If inputs are complex, calculates average: mape(real)/2 + mape(imag)/2
@@ -162,12 +158,12 @@ def mean_abs_percentage_error(
     # mean over bins for value per frame
     err_f = np.around(np.mean(abs_err, axis=1), 3)
     # mean over all
-    err = np.around(np.mean(abs_err), 3)
+    err = float(np.around(np.mean(abs_err), 3))
     return err, err_b, err_f
-def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[float, np.ndarray, np.ndarray]:
     """Calculate log error
     :param reference: complex or real [frames, bins]
@@ -184,7 +180,7 @@ def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray
     # mean over bins for value per frame
     err_f = np.around(np.mean(log_err, axis=1), 3)
     # mean over all
-    err = np.around(np.mean(log_err), 3)
+    err = float(np.around(np.mean(log_err), 3))
     return err, err_b, err_f
@@ -196,7 +192,7 @@ def plot_mixpred(
     feature: Feature | None = None,
     predict: Predict | None = None,
     tp_title: str = "",
-) -> plt.Figure:
+) -> plt.Figure:  # pyright: ignore [reportPrivateImportUsage]
     from sonusai.mixture import SAMPLE_RATE
     num_plots = 2
@@ -224,22 +220,12 @@ def plot_mixpred(
     if feature is not None:
         p += 1
-        ax[p].imshow(
-            np.transpose(feature),
-            aspect="auto",
-            interpolation="nearest",
-            origin="lower",
-        )
+        ax[p].imshow(np.transpose(feature), aspect="auto", interpolation="nearest", origin="lower")
         ax[p].set_title("Feature")
     if predict is not None:
         p += 1
-        im = ax[p].imshow(
-            np.transpose(predict),
-            aspect="auto",
-            interpolation="nearest",
-            origin="lower",
-        )
+        im = ax[p].imshow(np.transpose(predict), aspect="auto", interpolation="nearest", origin="lower")
         ax[p].set_title("Predict " + tp_title)
         plt.colorbar(im, location="bottom")
@@ -251,7 +237,7 @@ def plot_pdb_predict_truth(
     truth_f: np.ndarray | None = None,
     metric: np.ndarray | None = None,
     tp_title: str = "",
-) -> plt.Figure:
+) -> plt.Figure:  # pyright: ignore [reportPrivateImportUsage]
     """Plot predict and optionally truth and a metric in power db, e.g. applies 10*log10(predict)"""
     num_plots = 2
     if truth_f is not None:
@@ -277,24 +263,12 @@ def plot_pdb_predict_truth(
     pred_avg = 10 * np.log10(np.mean(predict, axis=-1) + np.finfo(np.float32).eps)
     p += 1
     x_axis = np.arange(len(pred_avg), dtype=np.float32)  # / SAMPLE_RATE
-    ax[p].plot(
-        x_axis,
-        pred_avg,
-        color="black",
-        linestyle="dashed",
-        label="Predict mean over freq.",
-    )
+    ax[p].plot(x_axis, pred_avg, color="black", linestyle="dashed", label="Predict mean over freq.")
     ax[p].set_ylabel("mean db", color="black")
     ax[p].set_xlim(x_axis[0], x_axis[-1])
     if truth_f is not None:
         truth_avg = 10 * np.log10(np.mean(truth_f, axis=-1) + np.finfo(np.float32).eps)
-        ax[p].plot(
-            x_axis,
-            truth_avg,
-            color="green",
-            linestyle="dashed",
-            label="Truth mean over freq.",
-        )
+        ax[p].plot(x_axis, truth_avg, color="green", linestyle="dashed", label="Truth mean over freq.")
     if metric is not None:  # instantiate 2nd y-axis that shares the same x-axis
         ax2 = ax[p].twinx()
@@ -317,7 +291,7 @@ def plot_e_predict_truth(
     truth_wav: np.ndarray | None = None,
     metric: np.ndarray | None = None,
     tp_title: str = "",
-) -> plt.Figure:
+) -> plt.Figure:  # pyright: ignore [reportPrivateImportUsage]
     """Plot predict spectrogram and waveform and optionally truth and a metric)"""
     num_plots = 2
     if truth_f is not None:
@@ -335,13 +309,7 @@ def plot_e_predict_truth(
     if truth_f is not None:  # plot truth if provided and use same colormap as predict
         p += 1
-        ax[p].imshow(
-            truth_f.transpose(),
-            im.cmap,
-            aspect="auto",
-            interpolation="nearest",
-            origin="lower",
-        )
+        ax[p].imshow(truth_f.transpose(), im.cmap, aspect="auto", interpolation="nearest", origin="lower")
         ax[p].set_title("Truth")
     # Plot predict wav, and optionally truth avg and metric lines
@@ -383,7 +351,17 @@ def plot_e_predict_truth(
     return fig
-def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
+def _process_mixture(
+    m_id: int,
+    truth_location: str,
+    predict_location: str,
+    predict_wav_mode: bool,
+    truth_est_mode: bool,
+    enable_plot: bool,
+    enable_wav: bool,
+    asr_method: str,
+    target_f_key: str,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     import pickle
     from os.path import basename
     from os.path import join
@@ -412,19 +390,10 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     from sonusai.utils import unstack_complex
     from sonusai.utils import write_audio
-    global MP_GLOBAL
-    mixdb = MP_GLOBAL.mixdb
-    predict_location = MP_GLOBAL.predict_location
-    predict_wav_mode = MP_GLOBAL.predict_wav_mode
-    truth_est_mode = MP_GLOBAL.truth_est_mode
-    enable_plot = MP_GLOBAL.enable_plot
-    enable_wav = MP_GLOBAL.enable_wav
-    asr_method = MP_GLOBAL.asr_method
-    asr_model_name = MP_GLOBAL.asr_model_name
+    mixdb = MixtureDatabase(truth_location)
-    # 1)  Read predict data, var predict with shape [BatchSize,Classes] or [BatchSize,Tsteps,Classes]
-    output_name = join(predict_location, mixdb.mixture(mixid).name)
+    # 1)  Read predict data, var predict with shape [BatchSize,Classes] or [batch, timesteps, classes]
+    output_name = join(predict_location, mixdb.mixture(m_id).name + ".h5")
     predict = None
     if truth_est_mode:
         # in truth estimation mode we use the truth in place of prediction to see metrics with perfect input
@@ -439,31 +408,31 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
                     predict = np.array(f["predict"])
             except Exception as e:
                 raise OSError(f"Error reading {output_name}: {e}") from e
-            # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
+            # reshape to always be [frames, classes] where ndim==3 case frames = batch * timesteps
             if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
                 # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
                 predict, _ = reshape_outputs(predict=predict, truth=None, timesteps=predict.shape[1])
         else:
             base_name, ext = splitext(output_name)
             predict_name = join(base_name + ".wav")
-            audio = read_audio(predict_name)
+            audio = read_audio(predict_name, use_cache=True)
             predict = forward_transform(audio, mixdb.ft_config)
             if mixdb.feature[0:1] == "h":
                 predict = power_compress(predict)
             predict = stack_complex(predict)
     # 2) Collect true target, noise, mixture data, trim to predict size if needed
-    tmp = mixdb.mixture_targets(mixid)  # targets is list of pre-IR and pre-specaugment targets
-    target_f = mixdb.mixture_targets_f(mixid, targets=tmp)[0]
+    tmp = mixdb.mixture_targets(m_id)  # time-dom augmented targets is list of pre-IR and pre-specaugment targets
+    target_f = mixdb.mixture_targets_f(m_id, targets=tmp)[0]
     target = tmp[0]
-    mixture = mixdb.mixture_mixture(mixid)  # note: gives full reverberated/distorted target, but no specaugment
+    mixture = mixdb.mixture_mixture(m_id)  # note: gives full reverberated/distorted target, but no specaugment
     # noise_wo_dist = mixdb.mixture_noise(mixid)            # noise without specaugment and distortion
     # noise_wo_dist_f = mixdb.mixture_noise_f(mixid, noise=noise_wo_dist)
     noise = mixture - target  # has time-domain distortion (ir,etc.) but does not have specaugment
     # noise_f = mixdb.mixture_noise_f(mixid, noise=noise)
     # note: uses pre-IR, pre-specaug audio
-    segsnr_f: np.ndarray = mixdb.mixture_metrics(mixid, ["ssnr"])[0]  # type: ignore[assignment]
-    mixture_f = mixdb.mixture_mixture_f(mixid, mixture=mixture)
+    segsnr_f = mixdb.mixture_metrics(m_id, ["ssnr"])["ssnr"][0]
+    mixture_f = mixdb.mixture_mixture_f(m_id, mixture=mixture)
     noise_f = mixture_f - target_f  # true noise in freq domain includes specaugment and time-domain ir,distortions
     # segsnr_f = mixdb.mixture_segsnr(mixid, target=target, noise=noise)
     segsnr_f[segsnr_f == np.inf] = DB_99
@@ -476,13 +445,21 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # gen feature, truth - note feature only used for plots
     # TODO: parse truth_f for different formats
-    feature, truth_f = mixdb.mixture_ft(mixid, mixture_f=mixture_f)
+    feature, truth_all = mixdb.mixture_ft(m_id, mixture_f=mixture_f)
+    truth_f = truth_all[target_f_key]
+    if truth_f.ndim > 2:  # note this may not be needed anymore as all target_f truth is 3 dims
+        if truth_f.shape[1] != 1:
+            logger.info("Error: target_f truth has stride > 1, exiting.")
+            raise SystemExit(1)
+        else:
+            truth_f = truth_f[:, 0, :]  # remove stride dimension
     # ignore mixup
-    for truth_setting in mixdb.target_file(mixdb.mixture(mixid).targets[0].file_id).truth_configs:
-        if truth_setting.function == "target_mixture_f":
-            half = truth_f.shape[-1] // 2
-            # extract target_f only
-            truth_f = truth_f[..., :half]
+    # for truth_setting in mixdb.target_file(mixdb.mixture(mixid).targets[0].file_id).truth_settings:
+    #     if truth_setting.function == 'target_mixture_f':
+    #         half = truth_f.shape[-1] // 2
+    #         # extract target_f only
+    #         truth_f = truth_f[..., :half]
     if not truth_est_mode:
         if predict.shape[0] < target_f.shape[0]:  # target_f, truth_f, mixture_f, etc. same size
@@ -503,15 +480,17 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             )
             trim_f = predict.shape[0] - target_f.shape[0]
             predict = predict[0:-trim_f, :]
-            # raise ValueError(
+            # raise SonusAIError(
             #     f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')
     # 3) Extraction - format proper complex and wav estimates and truth (unstack, uncompress, inv tf, etc.)
     if truth_est_mode:
         predict = truth_f  # substitute truth for the prediction (for test/debug)
         predict_complex = unstack_complex(predict)  # unstack
-        # if feat has compressed mag and truth does not, compress it
-        if mixdb.feature[0:1] == "h" and mixdb.target_file(1).truth_configs[0].function[0:10] != "targetcmpr":
+        # if feature has compressed mag and truth does not, compress it
+        if mixdb.feature[0:1] == "h" and not mixdb.truth_configs[first_key(mixdb.truth_configs)].function.startswith(
+            "targetcmpr"
+        ):
             predict_complex = power_compress(predict_complex)  # from uncompressed truth
     else:
         predict_complex = unstack_complex(predict)
@@ -556,10 +535,14 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # logger.debug(f'wsdr ccoefs for mixid {mixid} = {wsdr_cc}.')
     # Speech intelligibility measure - PESQ
-    if int(mixdb.mixture(mixid).snr) > -99:
+    if int(mixdb.mixture(m_id).snr) > -99:
         # len = target_est_wav.shape[0]
         pesq_speech, csig_tg, cbak_tg, covl_tg = calc_speech(target_est_wav, target_fi)
-        pesq_mixture, csig_mx, cbak_mx, covl_mx = mixdb.mixture_metrics(mixid, ["mxpesq", "mxcsig", "mxcbak", "mxcovl"])
+        metrics = mixdb.mixture_metrics(m_id, ["mxpesq", "mxcsig", "mxcbak", "mxcovl"])
+        pesq_mixture = metrics["mxpesq"]
+        csig_mx = metrics["mxcsig"]
+        cbak_mx = metrics["mxcbak"]
+        covl_mx = metrics["mxcovl"]
         # pesq_speech_tst = calc_pesq(hypothesis=target_est_wav, reference=target)
         # pesq_mixture_tst = calc_pesq(hypothesis=mixture, reference=target)
         # pesq improvement
@@ -581,25 +564,37 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     asr_tt = None
     asr_mx = None
     asr_tge = None
-    asr_engines = list(mixdb.asr_configs.keys())
-    if len(asr_engines) > 0 and not mixdb.mixture(mixid).is_noise_only:  # noise only, ignore/reset target asr
-        wer_mx = float(mixdb.mixture_metrics(mixid, [f"mxwer.{asr_engines[0]}"])[0]) * 100
-        asr_tt = MP_GLOBAL.mixdb.mixture_speech_metadata(mixid, "text")[0]  # ignore mixup
-        if asr_tt is None:
-            asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth
+    # asr_engines = list(mixdb.asr_configs.keys())
+    if asr_method is not None and mixdb.mixture(m_id).snr >= -96:  # noise only, ignore/reset target ASR
+        asr_mx_name = f"mxasr.{asr_method}"
+        wer_mx_name = f"mxwer.{asr_method}"
+        asr_tt_name = f"tasr.{asr_method}"
+        metrics = mixdb.mixture_metrics(m_id, [asr_mx_name, wer_mx_name, asr_tt_name])
+        asr_mx = metrics[asr_mx_name][0]
+        wer_mx = metrics[wer_mx_name][0]
+        asr_tt = metrics[asr_tt_name][0]
         if asr_tt:
-            asr_tge = calc_asr(target_est_wav, engine=asr_method, whisper_model_name=asr_model_name).text
-            wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate wer
+            noiseadd = None  # TBD add as switch, default -30
+            if noiseadd is not None:
+                ngain = np.power(10, min(float(noiseadd), 0.0) / 20.0)  # limit to gain <1, convert to float
+                tgasr_est_wav = target_est_wav + ngain * noise_est_wav  # add back noise at low level
+            else:
+                tgasr_est_wav = target_est_wav
+            # logger.info(f'Calculating prediction ASR for mixid {mixid}')
+            asr_cfg = mixdb.asr_configs[asr_method]
+            asr_tge = calc_asr(tgasr_est_wav, **asr_cfg).text
+            wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate WER
             if wer_mx == 0.0:
                 if wer_tge == 0.0:
                     wer_pi = 0.0
                 else:
-                    wer_pi = -999.0
+                    wer_pi = -999.0  # instead of -Inf
             else:
                 wer_pi = 100 * (wer_mx - wer_tge) / wer_mx
         else:
-            print(f"Warning: mixid {mixid} asr truth is empty, setting to 0% wer")
+            logger.warning(f"Warning: mixid {m_id} ASR truth is empty, setting to 0% WER")
             wer_mx = float(0)
             wer_tge = float(0)
             wer_pi = float(0)
@@ -633,10 +628,10 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         "SPFILE",
         "NFILE",
     ]
-    ti = mixdb.mixture(mixid).targets[0].file_id
-    ni = mixdb.mixture(mixid).noise.file_id
+    ti = mixdb.mixture(m_id).targets[0].file_id
+    ni = mixdb.mixture(m_id).noise.file_id
     metr1 = [
-        mixdb.mixture(mixid).snr,
+        mixdb.mixture(m_id).snr,
         pesq_mixture,
         pesq_speech,
         pesq_impr_pc,
@@ -658,17 +653,11 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         basename(mixdb.target_file(ti).name),
         basename(mixdb.noise_file(ni).name),
     ]
-    mtab1 = pd.DataFrame([metr1], columns=mtable1_col, index=[mixid])
+    mtab1 = pd.DataFrame([metr1], columns=mtable1_col, index=[m_id])
     # Stats of per frame estimation metrics
     metr2 = pd.DataFrame(
-        {
-            "SSNR": segsnr_f,
-            "PCM": pcm_frame,
-            "SLERR": lerr_tg_frame,
-            "NLERR": lerr_n_frame,
-            "SPD": phd_frame,
-        }
+        {"SSNR": segsnr_f, "PCM": pcm_frame, "SLERR": lerr_tg_frame, "NLERR": lerr_n_frame, "SPD": phd_frame}
     )
     metr2 = metr2.describe()  # Use pandas stat function
     # Change SSNR stats to dB, except count.  SSNR is index 0, pandas requires using iloc
@@ -679,29 +668,33 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         [metr2.columns, ["Avg", "Min", "Med", "Max", "Std"]], names=["Metric", "Stat"]
     )
     dat1row = metr2.loc[["mean", "min", "50%", "max", "std"], :].T.stack().to_numpy().reshape((1, -1))
-    mtab2 = pd.DataFrame(dat1row, index=[mixid], columns=new_labels)
-    mtab2.insert(0, "MXSNR", mixdb.mixture(mixid).snr, False)  # add MXSNR as the first metric column
+    mtab2 = pd.DataFrame(dat1row, index=[m_id], columns=new_labels)
+    mtab2.insert(0, "MXSNR", mixdb.mixture(m_id).snr, False)  # add MXSNR as the first metric column
     all_metrics_table_1 = mtab1  # return to be collected by process
     all_metrics_table_2 = mtab2  # return to be collected by process
-    metric_name = base_name + "_metric_spenh.txt"
-    with open(metric_name, "w") as f, redirect_stdout(f):
-        print("Speech enhancement metrics:")
-        print(mtab1.round(2).to_string(float_format=lambda x: f"{x:.2f}"))
-        print("")
-        print(f"Extraction statistics over {mixture_f.shape[0]} frames:")
-        print(metr2.round(2).to_string(float_format=lambda x: f"{x:.2f}"))
-        print("")
-        print(f"Target path: {mixdb.target_file(ti).name}")
-        print(f"Noise path: {mixdb.noise_file(ni).name}")
+    if asr_method is None:
+        metric_name = base_name + "_metric_spenh.txt"
+    else:
+        metric_name = base_name + "_metric_spenh_" + asr_method + ".txt"
+    with open(metric_name, "w") as f:
+        print("Speech enhancement metrics:", file=f)
+        print(mtab1.round(2).to_string(float_format=lambda x: f"{x:.2f}"), file=f)
+        print("", file=f)
+        print(f"Extraction statistics over {mixture_f.shape[0]} frames:", file=f)
+        print(metr2.round(2).to_string(float_format=lambda x: f"{x:.2f}"), file=f)
+        print("", file=f)
+        print(f"Target path: {mixdb.target_file(ti).name}", file=f)
+        print(f"Noise path: {mixdb.noise_file(ni).name}", file=f)
         if asr_method != "none":
-            print(f"ASR method: {asr_method} and whisper model (if used):  {asr_model_name}")
-            print(f"ASR truth:  {asr_tt}")
-            print(f"ASR result for mixture:  {asr_mx}")
-            print(f"ASR result for prediction:  {asr_tge}")
+            print(f"ASR method: {asr_method}", file=f)
+            print(f"ASR truth:  {asr_tt}", file=f)
+            print(f"ASR result for mixture:  {asr_mx}", file=f)
+            print(f"ASR result for prediction:  {asr_tge}", file=f)
-        print(f"Augmentations: {mixdb.mixture(mixid)}")
+        print(f"Augmentations: {mixdb.mixture(m_id)}", file=f)
     # 7) write wav files
     if enable_wav:
@@ -728,7 +721,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         # Reshape to get frames*decimated_stride, num_bands
         step = int(mixdb.feature_samples / mixdb.feature_step_samples)
         if feature.ndim != 3:
-            raise ValueError("feature does not have 3 dimensions: frames, stride, num_bands")
+            raise OSError("feature does not have 3 dimensions: frames, stride, num_bands")
         # for feature cn*00n**
         feat_sgram = unstack_complex(feature)
@@ -738,17 +731,19 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         with PdfPages(plot_name) as pdf:
             # page1 we always have a mixture and prediction, target optional if truth provided
-            tfunc_name = mixdb.target_file(1).truth_configs[0].function  # first target, assumes all have same
-            if tfunc_name == "mapped_snr_f":
-                # leave as unmapped snr
-                predplot = predict
-                tfunc_name = mixdb.target_file(1).truth_configs[0].function
-            elif tfunc_name in ("target_f", "target_mixture_f"):
-                predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
-            else:
-                # use dB scale
-                predplot = 10 * np.log10(predict + np.finfo(np.float32).eps)
-                tfunc_name = tfunc_name + " (db)"
+            # For speech enhancement, target_f is definitely included:
+            predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
+            tfunc_name = "target_f"
+            # if tfunc_name == 'mapped_snr_f':
+            #     # leave as unmapped snr
+            #     predplot = predict
+            #     tfunc_name = mixdb.target_file(1).truth_settings[0].function
+            # elif tfunc_name == 'target_f' or 'target_mixture_f':
+            #     predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
+            # else:
+            #     # use dB scale
+            #     predplot = 10 * np.log10(predict + np.finfo(np.float32).eps)
+            #     tfunc_name = tfunc_name + ' (db)'
             mixspec = 20 * np.log10(abs(mixture_f) + np.finfo(np.float32).eps)
             fig_obj = plot_mixpred(
@@ -816,8 +811,7 @@ def main():
     verbose = args["--verbose"]
     mixids = args["--mixid"]
-    asr_method = args["--asr-method"].lower()
-    asr_model_name = args["--model"].lower()
+    asr_method = args["--asr-method"]
     truth_est_mode = args["--truth-est-mode"]
     enable_plot = args["--plot"]
     enable_wav = args["--wav"]
@@ -827,6 +821,7 @@ def main():
     truth_location = args["TLOC"]
     import glob
+    from functools import partial
     from os.path import basename
     from os.path import isdir
     from os.path import join
@@ -837,16 +832,13 @@ def main():
     from sonusai import initial_log_messages
     from sonusai import logger
     from sonusai import update_console_handler
-    from sonusai.mixture import DEFAULT_SPEECH
     from sonusai.mixture import MixtureDatabase
-    from sonusai.mixture import read_audio
-    from sonusai.utils import calc_asr
     from sonusai.utils import par_track
     from sonusai.utils import track
     # Check prediction subdirectory
     if not isdir(predict_location):
-        print(f"The specified predict location {predict_location} is not a valid subdirectory path, exiting ...")
+        print(f"The specified predict location {predict_location} is not a valid subdirectory path, exiting.")
     # all_predict_files = listdir(predict_location)
     all_predict_files = glob.glob(predict_location + "/*.h5")
@@ -855,7 +847,7 @@ def main():
     if len(all_predict_files) <= 0 and not truth_est_mode:
         all_predict_files = glob.glob(predict_location + "/*.wav")  # check for wav files
         if len(all_predict_files) <= 0:
-            print(f"Subdirectory {predict_location} has no .h5 or .wav files, exiting ...")
+            print(f"Subdirectory {predict_location} has no .h5 or .wav files, exiting.")
         else:
             logger.info(f"Found {len(all_predict_files)} prediction .wav files.")
             predict_wav_mode = True
@@ -877,59 +869,40 @@ def main():
     logger.info(
         f"Found mixdb of {mixdb.num_mixtures} total mixtures, with {mixdb.num_classes} classes in {truth_location}"
     )
-    logger.info(f"Only running specified subset of {len(mixids)} mixtures")
+    # speech enhancement metrics and audio truth requires target_f truth type, check it is present
+    target_f_key = None
+    logger.info(f"mixdb has {len(mixdb.truth_configs)} truth types defined, checking that target_f type is present.")
+    for key in mixdb.truth_configs:
+        if mixdb.truth_configs[key].function == "target_f":
+            target_f_key = key
+    if target_f_key is None:
+        logger.error("mixdb does not have target_f truth define, required for speech enhancement metrics, exiting.")
+        raise SystemExit(1)
-    enable_asr_warmup = False
-    if asr_method == "none":
-        fnb = "metric_spenh_"
-    elif asr_method == "google":
-        fnb = "metric_spenh_ggl_"
-        logger.info(f"ASR enabled with method {asr_method}")
-        enable_asr_warmup = True
-    elif asr_method == "deepgram":
-        fnb = "metric_spenh_dgram_"
-        logger.info(f"ASR enabled with method {asr_method}")
-        enable_asr_warmup = True
-    elif asr_method == "aixplain_whisper":
-        fnb = "metric_spenh_whspx_" + asr_model_name + "_"
-        logger.info(f"ASR enabled with method {asr_method} and whisper model {asr_model_name}")
-        enable_asr_warmup = True
-    elif asr_method == "whisper":
-        fnb = "metric_spenh_whspl_" + asr_model_name + "_"
-        logger.info(f"ASR enabled with method {asr_method} and whisper model {asr_model_name}")
-        enable_asr_warmup = True
-    elif asr_method == "aaware_whisper":
-        fnb = "metric_spenh_whspaaw_" + asr_model_name + "_"
-        logger.info(f"ASR enabled with method {asr_method} and whisper model {asr_model_name}")
-        enable_asr_warmup = True
-    elif asr_method == "faster_whisper":
-        fnb = "metric_spenh_fwhsp_" + asr_model_name + "_"
-        logger.info(f"ASR enabled with method {asr_method} and whisper model {asr_model_name}")
-        enable_asr_warmup = True
-    elif asr_method == "sensory":
-        fnb = "metric_spenh_snsr_" + asr_model_name + "_"
-        logger.info(f"ASR enabled with method {asr_method} and model {asr_model_name}")
-        enable_asr_warmup = True
-    else:
-        logger.error(f"Unrecognized ASR method: {asr_method}")
-        return
-    if enable_asr_warmup:
-        audio = read_audio(DEFAULT_SPEECH)
-        logger.info("Warming up asr method, note for cloud service this could take up to a few min ...")
-        asr_chk = calc_asr(audio, engine=asr_method, whisper_model_name=asr_model_name)
-        logger.info(f"Warmup completed, results {asr_chk}")
-    global MP_GLOBAL
+    logger.info(f"Only running specified subset of {len(mixids)} mixtures")
-    MP_GLOBAL.mixdb = mixdb
-    MP_GLOBAL.predict_location = predict_location
-    MP_GLOBAL.predict_wav_mode = predict_wav_mode
-    MP_GLOBAL.truth_est_mode = truth_est_mode
-    MP_GLOBAL.enable_plot = enable_plot
-    MP_GLOBAL.enable_wav = enable_wav
-    MP_GLOBAL.asr_method = asr_method
-    MP_GLOBAL.asr_model_name = asr_model_name
+    asr_config_en = None
+    fnb = "metric_spenh_"
+    if asr_method is not None:
+        if asr_method in mixdb.asr_configs:
+            logger.info(f"Specified ASR method {asr_method} exists in mixdb.asr_configs, it will be used for ")
+            logger.info("prediction ASR and WER, and pre-calculated target and mixture ASR if available.")
+            asr_config_en = True
+            asr_cfg = mixdb.asr_configs[asr_method]
+            fnb = "metric_spenh_" + asr_method + "_"
+            logger.info(f"Using ASR cfg: {asr_cfg} ")
+            # audio = read_audio(DEFAULT_SPEECH, use_cache=True)
+            # logger.info(f'Warming up {asr_method}, note for cloud service this could take up to a few minutes.')
+            # asr_chk = calc_asr(audio, **asr_cfg)
+            # logger.info(f'Warmup completed, results {asr_chk}')
+        else:
+            logger.info(
+                f"Specified ASR method {asr_method} does not exists in mixdb.asr_configs."
+                f"Must choose one of the following (or none):"
+            )
+            logger.info(f"{', '.join(mixdb.asr_configs)}")
+            logger.error("Unrecognized ASR method, exiting.")
+            raise SystemExit(1)
     num_cpu = psutil.cpu_count()
     cpu_percent = psutil.cpu_percent(interval=1)
@@ -944,12 +917,33 @@ def main():
     # Individual mixtures use pandas print, set precision to 2 decimal places
     # pd.set_option('float_format', '{:.2f}'.format)
-    logger.info(f"Calculating metrics for {len(mixids)} mixtures using {use_cpu} parallel processes ...")
-    progress = track(total=len(mixids), desc="calc_metric_spenh")
+    logger.info(f"Calculating metrics for {len(mixids)} mixtures using {use_cpu} parallel processes")
+    # progress = tqdm(total=len(mixids), desc='calc_metric_spenh', mininterval=1)
+    progress = track(total=len(mixids))
     if use_cpu is None:
-        all_metrics_tables = par_track(_process_mixture, mixids, progress=progress, no_par=True)
+        # No worker count requested: run serially.
+        no_par = True
+        num_cpus = None
     else:
-        all_metrics_tables = par_track(_process_mixture, mixids, progress=progress, num_cpus=use_cpu)
+        # Honor the requested worker count, as the removed par_track(..., num_cpus=use_cpu) call did.
+        no_par = False
+        num_cpus = use_cpu
+    all_metrics_tables = par_track(
+        partial(
+            _process_mixture,
+            truth_location=truth_location,
+            predict_location=predict_location,
+            predict_wav_mode=predict_wav_mode,
+            truth_est_mode=truth_est_mode,
+            enable_plot=enable_plot,
+            enable_wav=enable_wav,
+            asr_method=asr_method,
+            target_f_key=target_f_key,
+        ),
+        mixids,
+        progress=progress,
+        num_cpus=num_cpus,
+        no_par=no_par,
+    )
     progress.close()
     all_metrics_table_1 = pd.concat([item[0] for item in all_metrics_tables])
@@ -1010,7 +1004,7 @@ def main():
             all_nom99_mean["WERi%"] = 0.0
         else:
             all_nom99_mean["WERi%"] = -999.0
-    else:  # wer%
+    else:  # WER%
         all_nom99_mean["WERi%"] = 100 * (all_nom99_mean["MXWER"] - all_nom99_mean["WER"]) / all_nom99_mean["MXWER"]
     num_mix = len(mixids)
@@ -1023,33 +1017,37 @@ def main():
         else:
             ofname = join(predict_location, fnb + "summary_truest.txt")
-        with open(ofname, "w") as f, redirect_stdout(f):
-            print(f"ASR enabled with method {asr_method}, whisper model, if used: {asr_model_name}")
-            print(f"Speech enhancement metrics avg over all {len(all_mtab1_sorted_nom99)} non -99 SNR mixtures:")
-            print(all_nom99_mean.to_frame().T.round(2).to_string(float_format=lambda x: f"{x:.2f}", index=False))
-            print("\nSpeech enhancement metrics avg over each SNR:")
-            print(mtab_snr_summary.round(2).to_string(float_format=lambda x: f"{x:.2f}", index=False))
-            print("")
-            print("Extraction statistics stats avg over each SNR:")
+        with open(ofname, "w") as f:
+            print(f"ASR enabled with method {asr_method}", file=f)
+            print(
+                f"Speech enhancement metrics avg over all {len(all_mtab1_sorted_nom99)} non -99 SNR mixtures:", file=f
+            )
+            print(
+                all_nom99_mean.to_frame().T.round(2).to_string(float_format=lambda x: f"{x:.2f}", index=False), file=f
+            )
+            print("\nSpeech enhancement metrics avg over each SNR:", file=f)
+            print(mtab_snr_summary.round(2).to_string(float_format=lambda x: f"{x:.2f}", index=False), file=f)
+            print("", file=f)
+            print("Extraction statistics stats avg over each SNR:", file=f)
             # with pd.option_context('display.max_colwidth', 9):
             # with pd.set_option('float_format', '{:.1f}'.format):
-            print(mtab_snr_summary_em.round(1).to_string(float_format=lambda x: f"{x:.1f}", index=False))
-            print("")
+            print(mtab_snr_summary_em.round(1).to_string(float_format=lambda x: f"{x:.1f}", index=False), file=f)
+            print("", file=f)
             # pd.set_option('float_format', '{:.2f}'.format)
-            print(f"Speech enhancement metrics stats over all {num_mix} mixtures:")
-            print(all_metrics_table_1.describe().round(2).to_string(float_format=lambda x: f"{x:.2f}"))
-            print("")
-            print(f"Extraction statistics stats over all {num_mix} mixtures:")
-            print(all_metrics_table_2.describe().round(2).to_string(float_format=lambda x: f"{x:.1f}"))
-            print("")
+            print(f"Speech enhancement metrics stats over all {num_mix} mixtures:", file=f)
+            print(all_metrics_table_1.describe().round(2).to_string(float_format=lambda x: f"{x:.2f}"), file=f)
+            print("", file=f)
+            print(f"Extraction statistics stats over all {num_mix} mixtures:", file=f)
+            print(all_metrics_table_2.describe().round(2).to_string(float_format=lambda x: f"{x:.1f}"), file=f)
+            print("", file=f)
-            print("Speech enhancement metrics all-mixtures list:")
-            # print(all_metrics_table_1.head().style.format(precision=2))
-            print(all_metrics_table_1.round(2).to_string(float_format=lambda x: f"{x:.2f}"))
-            print("")
-            print("Extraction statistics all-mixtures list:")
-            print(all_metrics_table_2.round(2).to_string(float_format=lambda x: f"{x:.1f}"))
+            print("Speech enhancement metrics all-mixtures list:", file=f)
+            # print(all_metrics_table_1.head().style.format(precision=2), file=f)
+            print(all_metrics_table_1.round(2).to_string(float_format=lambda x: f"{x:.2f}"), file=f)
+            print("", file=f)
+            print("Extraction statistics all-mixtures list:", file=f)
+            print(all_metrics_table_2.round(2).to_string(float_format=lambda x: f"{x:.1f}"), file=f)
         # Write summary to .csv file
         if not truth_est_mode:
@@ -1084,7 +1082,7 @@ def main():
         label = f"Extraction statistics stats over {num_mix} mixtures:"
         pd.DataFrame([label]).to_csv(csv_name, **header_args)
         all_metrics_table_2.describe().round(2).to_csv(csv_name, **table_args)
-        label = f"ASR enabled with method {asr_method}, whisper model, if used: {asr_model_name}"
+        label = f"ASR enabled with method {asr_method}"
         pd.DataFrame([label]).to_csv(csv_name, **header_args)
         if not truth_est_mode:
@@ -1104,3 +1102,37 @@ def main():
 if __name__ == "__main__":
     main()
+# if asr_method == 'none':
+#     fnb = 'metric_spenh_'
+# elif asr_method == 'google':
+#     fnb = 'metric_spenh_ggl_'
+#     logger.info(f'ASR enabled with method {asr_method}')
+#     enable_asr_warmup = True
+# elif asr_method == 'deepgram':
+#     fnb = 'metric_spenh_dgram_'
+#     logger.info(f'ASR enabled with method {asr_method}')
+#     enable_asr_warmup = True
+# elif asr_method == 'aixplain_whisper':
+#     fnb = 'metric_spenh_whspx_' + mixdb.asr_configs[asr_method]['model'] + '_'
+#     asr_model_name = mixdb.asr_configs[asr_method]['model']
+#     enable_asr_warmup = True
+# elif asr_method == 'whisper':
+#     fnb = 'metric_spenh_whspl_' + mixdb.asr_configs[asr_method]['model'] + '_'
+#     asr_model_name = mixdb.asr_configs[asr_method]['model']
+#     enable_asr_warmup = True
+# elif asr_method == 'aaware_whisper':
+#     fnb = 'metric_spenh_whspaaw_' + mixdb.asr_configs[asr_method]['model'] + '_'
+#     asr_model_name = mixdb.asr_configs[asr_method]['model']
+#     enable_asr_warmup = True
+# elif asr_method == 'faster_whisper':
+#     fnb = 'metric_spenh_fwhsp_' + mixdb.asr_configs[asr_method]['model'] + '_'
+#     asr_model_name = mixdb.asr_configs[asr_method]['model']
+#     enable_asr_warmup = True
+# elif asr_method == 'sensory':
+#     fnb = 'metric_spenh_snsr_' + mixdb.asr_configs[asr_method]['model'] + '_'
+#     asr_model_name = mixdb.asr_configs[asr_method]['model']
+#     enable_asr_warmup = True
+# else:
+#     logger.error(f'Unrecognized ASR method: {asr_method}')
+#     return

sonusai 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl

sonusai 0.19.9py3-none-any.whl → 0.20.2py3-none-any.whl