sonusai 0.17.2__tar.gz → 0.18.0__tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- {sonusai-0.17.2 → sonusai-0.18.0}/PKG-INFO +4 -1
- {sonusai-0.17.2 → sonusai-0.18.0}/pyproject.toml +6 -2
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/__init__.py +0 -1
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/audiofe.py +3 -3
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/calc_metric_spenh.py +81 -52
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc/doc.py +0 -24
- sonusai-0.18.0/sonusai/genmetrics.py +146 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genmixdb.py +0 -2
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/__init__.py +0 -1
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/constants.py +0 -1
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/datatypes.py +2 -9
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/generation.py +136 -38
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/helpers.py +58 -1
- sonusai-0.18.0/sonusai/mixture/mapped_snr_f.py +100 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/mixdb.py +293 -170
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/sox_augmentation.py +3 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/tokenized_shell_vars.py +8 -1
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mkwav.py +4 -4
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/onnx_predict.py +2 -2
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/post_spenh_targetf.py +2 -2
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/textgrid.py +6 -24
- sonusai-0.17.2/sonusai/speech/voxceleb2.py → sonusai-0.18.0/sonusai/speech/voxceleb.py +19 -3
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/__init__.py +1 -1
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asr_functions/aaware_whisper.py +2 -2
- sonusai-0.17.2/sonusai/utils/wave.py → sonusai-0.18.0/sonusai/utils/write_audio.py +2 -2
- sonusai-0.17.2/sonusai/mixture/mapped_snr_f.py +0 -53
- sonusai-0.17.2/sonusai/mixture/speaker_metadata.py +0 -35
- sonusai-0.17.2/sonusai/mkmanifest.py +0 -209
- sonusai-0.17.2/sonusai/utils/asr_manifest_functions/__init__.py +0 -6
- sonusai-0.17.2/sonusai/utils/asr_manifest_functions/data.py +0 -1
- sonusai-0.17.2/sonusai/utils/asr_manifest_functions/librispeech.py +0 -46
- sonusai-0.17.2/sonusai/utils/asr_manifest_functions/mcgill_speech.py +0 -29
- sonusai-0.17.2/sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +0 -66
- {sonusai-0.17.2 → sonusai-0.18.0}/README.rst +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/aawscd_probwrite.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/genmixdb.yml +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/speech_ma01_01.wav +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/whitenoise.wav +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genft.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genmix.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/gentcst.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/lsdb.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/main.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_class_weights.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_optimal_thresholds.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_pcm.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_pesq.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_sa_sdr.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_sample_weights.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_wer.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_wsdr.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/class_summary.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/confusion_matrix_summary.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/one_hot.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/snr_summary.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/audio.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/augmentation.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/class_count.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/config.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/eq_rule_is_valid.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/feature.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/log_duration_and_sizes.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/soundfile_audio.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/sox_audio.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/spectral_mask.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/target_class_balancing.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/targets.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/torchaudio_audio.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/torchaudio_augmentation.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/crm.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/data.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/energy.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/file.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/phoneme.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/sed.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/target.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/plot.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/queries/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/queries/queries.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/l2arctic.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/librispeech.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/mcgill.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/timit.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/types.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/vctk.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/summarize_metric_spenh.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/tplot.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asl_p56.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asr.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asr_functions/__init__.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/audio_devices.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/braced_glob.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/calculate_input_shape.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/convert_string_to_number.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/create_timestamp.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/create_ts_name.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/dataclass_from_dict.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/db.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/docstring.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/energy_f.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/engineering_number.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/get_frames_per_batch.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/get_label_names.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/grouper.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/human_readable_size.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/max_text_width.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/model_utils.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/numeric_conversion.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/onnx_utils.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/parallel.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/path_info.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/print_mixture_details.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/ranges.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/read_mixture_data.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/read_predict_data.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/reshape.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/seconds_to_hms.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/stacked_complex.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/stratified_shuffle_split.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/yes_or_no.py +0 -0
- {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/vars.py +0 -0
{sonusai-0.17.2 → sonusai-0.18.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonusai
-Version: 0.17.2
+Version: 0.18.0
 Summary: Framework for building deep neural network models for sound, speech, and voice AI
 Home-page: https://aaware.com
 License: GPL-3.0-only
@@ -21,12 +21,15 @@ Requires-Dist: h5py (>=3.11.0,<4.0.0)
 Requires-Dist: jiwer (>=3.0.3,<4.0.0)
 Requires-Dist: librosa (>=0.10.1,<0.11.0)
 Requires-Dist: matplotlib (>=3.8.0,<4.0.0)
+Requires-Dist: mgzip (>=0.2.1,<0.3.0)
+Requires-Dist: numpy (>=1.26.4,<2.0.0)
 Requires-Dist: onnx (>=1.14.1,<2.0.0)
 Requires-Dist: onnxruntime (>=1.16.1,<2.0.0)
 Requires-Dist: paho-mqtt (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.1.1,<3.0.0)
 Requires-Dist: pesq (>=0.0.4,<0.0.5)
 Requires-Dist: praatio (>=6.2.0,<7.0.0)
+Requires-Dist: psutil (>=5,<6)
 Requires-Dist: pyaaware (>=1.5.7,<2.0.0)
 Requires-Dist: pyaudio (>=0.2.14,<0.3.0)
 Requires-Dist: pydub (>=0.25.1,<0.26.0)
{sonusai-0.17.2 → sonusai-0.18.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sonusai"
-version = "0.17.2"
+version = "0.18.0"
 description = "Framework for building deep neural network models for sound, speech, and voice AI"
 authors = ["Chris Eddington <chris@aaware.com>", "Jason Calderwood <jason@aaware.com>"]
 maintainers = ["Chris Eddington <chris@aaware.com>", "Jason Calderwood <jason@aaware.com>"]
@@ -21,12 +21,15 @@ h5py = "^3.11.0"
 jiwer = "^3.0.3"
 librosa = "^0.10.1"
 matplotlib = "^3.8.0"
+mgzip = "^0.2.1"
+numpy = "^1.26.4"
 onnx = "^1.14.1"
 onnxruntime = "^1.16.1"
 paho-mqtt = "^2.0.0"
 pandas = "^2.1.1"
 pesq = "^0.0.4"
 praatio = "^6.2.0"
+psutil = "^5"
 pyaaware = "^1.5.7"
 pyaudio = "^0.2.14"
 pydub = "^0.25.1"
@@ -47,7 +50,8 @@ mypy = "^1.6.0"
 mypy-extensions = "^1.0.0"
 pytest = "^8.1.1"
 sonusai-asr-cloud = "^0.1.0"
-sonusai-
+sonusai-asr-sensory = "^0.1.0"
+sonusai-torchl = "^0.3.0"
 types-pyyaml = "^6.0.12.12"
 types-requests = "^2.31.0.8"

{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/__init__.py

@@ -14,7 +14,6 @@ commands_doc = """
     genmixdb     Generate a mixture database
     gentcst      Generate target configuration from a subdirectory tree
     lsdb         List information about a mixture database
-    mkmanifest   Make ASR manifest JSON file
     mkwav        Make WAV files from a mixture database
     onnx_predict Run ONNX predict on a trained model
     plot         Plot mixture data
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/audiofe.py

@@ -86,7 +86,7 @@ def main() -> None:
     from sonusai.utils import create_timestamp
     from sonusai.utils import get_input_devices
     from sonusai.utils import load_ort_session
-    from sonusai.utils import
+    from sonusai.utils import write_audio

     ts = create_timestamp()
     capture_name = f'audiofe_capture_{ts}'
@@ -121,7 +121,7 @@ def main() -> None:
         logger.exception(e)
         return
     # Only write if capture from device, not for file input
-
+    write_audio(capture_wav, capture_audio, SAMPLE_RATE)
     logger.info('')
     logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_wav}')
@@ -175,7 +175,7 @@ def main() -> None:
     logger.info(f'Wrote predict with shape {predict.shape} to {h5_name}')

     predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
-
+    write_audio(predict_wav, predict_audio, SAMPLE_RATE)
     logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {predict_wav}')
     if debug:
         with h5py.File(h5_name, 'a') as f:
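Both call sites above now go through the new `write_audio` helper (the module rename `sonusai/utils/wave.py` → `sonusai/utils/write_audio.py` appears in the file list). Its internals are not part of this diff; a minimal stand-in, assuming the `soundfile` package, could look like:

```python
# Hypothetical stand-in for sonusai.utils.write_audio; the real internals are
# not shown in this diff. Assumes the soundfile package for WAV output.
import numpy as np
import soundfile as sf


def write_audio(name: str, audio: np.ndarray, sample_rate: int = 16000) -> None:
    """Write a mono audio buffer to a 16-bit PCM WAV file."""
    sf.write(name, audio, sample_rate, subtype='PCM_16')
```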
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/calc_metric_spenh.py

@@ -1,6 +1,6 @@
 """sonusai calc_metric_spenh

-usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] PLOC TLOC
+usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] [-n NCPU] PLOC TLOC

 options:
     -h, --help
@@ -10,6 +10,7 @@ options:
     -p, --plot                   Enable PDF plots file generation per mixture.
     -w, --wav                    Generate WAV files per mixture.
     -s, --summary                Enable summary files generation.
+    -n, --num_process NCPU       Number of parallel processes to use [default: auto]
     -e ASR, --asr-method ASR     ASR method: deepgram, google, aixplain_whisper, whisper, or sensory. [default: none]
     -m MODEL, --model            ASR model name used in some ASR methods. [default: tiny]

@@ -154,8 +155,8 @@ def snr(clean_speech, processed_speech, sample_rate):
         signal_energy = np.sum(np.square(clean_frame))
         noise_energy = np.sum(np.square(clean_frame - processed_frame))
         segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
-        segmental_snr[frame_count] =
-        segmental_snr[frame_count] =
+        segmental_snr[frame_count] = max(segmental_snr[frame_count], min_snr)
+        segmental_snr[frame_count] = min(segmental_snr[frame_count], max_snr)

         start = start + skip_rate

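The two rewritten lines clamp each frame's segmental SNR into `[min_snr, max_snr]`, the standard guard that keeps near-silent frames from producing unbounded values. A self-contained sketch of the same per-frame computation (frame length and limits here are illustrative, not sonusai's defaults):

```python
import numpy as np


def segmental_snr(clean: np.ndarray, processed: np.ndarray, frame_len: int = 512,
                  min_snr: float = -10.0, max_snr: float = 35.0) -> np.ndarray:
    """Per-frame SNR in dB, clamped as in the hunk above."""
    eps = np.finfo(np.float32).eps
    n_frames = len(clean) // frame_len
    out = np.empty(n_frames)
    for i in range(n_frames):
        c = clean[i * frame_len:(i + 1) * frame_len]
        p = processed[i * frame_len:(i + 1) * frame_len]
        snr = 10 * np.log10(np.sum(c ** 2) / (np.sum((c - p) ** 2) + eps) + eps)
        out[i] = np.clip(snr, min_snr, max_snr)  # same effect as the max()/min() pair
    return out
```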
@@ -697,11 +698,14 @@ def plot_e_predict_truth(predict: np.ndarray,


 def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
+    import pickle
     from os.path import basename
     from os.path import join
     from os.path import splitext

     import h5py
+    import mgzip
+    from matplotlib.backends.backend_pdf import PdfPages
     from numpy import inf
     from pystoi import stoi

@@ -718,7 +722,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     from sonusai.utils import reshape_outputs
     from sonusai.utils import stack_complex
     from sonusai.utils import unstack_complex
-    from sonusai.utils import
+    from sonusai.utils import write_audio

     mixdb = MP_GLOBAL.mixdb
     predict_location = MP_GLOBAL.predict_location
@@ -800,8 +804,12 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         mixture = mixture[0:-trim_t]
         truth_f = truth_f[0:-trim_f, :]
     elif predict.shape[0] > target_f.shape[0]:
-
-        f'
+        logger.debug(
+            f'Warning: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')
+        trim_f = predict.shape[0] - target_f.shape[0]
+        predict = predict[0:-trim_f, :]
+        # raise SonusAIError(
+        #     f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')

     # 3) Extraction - format proper complex and wav estimates and truth (unstack, uncompress, inv tf, etc.)
     if truth_est_mode:
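This branch used to raise `SonusAIError` (kept above as a comment); a prediction that runs a few frames long is now logged and trimmed instead of aborting the run. A small numeric illustration of the trim (the shapes are made up):

```python
import numpy as np

predict = np.zeros((1010, 257))   # 10 frames longer than truth; 257 bins is arbitrary
target_f = np.zeros((1000, 257))

trim_f = predict.shape[0] - target_f.shape[0]  # 10
predict = predict[0:-trim_f, :]                # now (1000, 257), aligned with truth
```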
@@ -883,13 +891,9 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         wer_tge = float('nan')
         wer_pi = float('nan')
     else:
-        asr_tt = MP_GLOBAL.mixdb.
+        asr_tt = MP_GLOBAL.mixdb.mixture_speech_metadata(mixid, 'text')[0]  # ignore mixup
         if asr_tt is None:
             asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth
-        # if MP_GLOBAL.mixdb.asr_manifests:
-        #     asr_tt = MP_GLOBAL.mixdb.mixture_asr_data(mixid)[0]  # ignore mixup
-        # else:
-        #     asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth

         if asr_tt:
             asr_mx = calc_asr(mixture, engine=asr_method, whisper_model_name=asr_model_name).text
@@ -957,10 +961,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             print(f'Noise path: {mixdb.noise_file(ni).name}', file=f)
             if asr_method != 'none':
                 print(f'ASR method: {asr_method} and whisper model (if used): {asr_model_name}', file=f)
-
-                    print(f'ASR truth from metadata: {asr_tt}', file=f)
-                else:
-                    print(f'ASR truth from wer method: {asr_tt}', file=f)
+                print(f'ASR truth: {asr_tt}', file=f)
             print(f'ASR result for mixture: {asr_mx}', file=f)
             print(f'ASR result for prediction: {asr_tge}', file=f)

@@ -968,12 +969,12 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:

     # 7) write wav files
     if enable_wav:
-
-
-        #
-
-
-
+        write_audio(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
+        write_audio(name=base_name + '_target.wav', audio=float_to_int16(target))
+        # write_audio(name=base_name + '_target_fi.wav', audio=float_to_int16(target_fi))
+        write_audio(name=base_name + '_noise.wav', audio=float_to_int16(noise))
+        write_audio(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
+        write_audio(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))

     # debug code to test for perfect reconstruction of the extraction method
     # note both 75% olsa-hanns and 50% olsa-hann modes checked to have perfect reconstruction
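Each restored call converts float samples through `float_to_int16` before writing. That helper's body is not part of this diff; a plausible equivalent:

```python
import numpy as np


def float_to_int16(audio: np.ndarray) -> np.ndarray:
    # Hypothetical equivalent of sonusai.utils.float_to_int16: clip to the
    # [-1, 1] float range and scale to int16 full scale.
    return (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
```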
@@ -984,7 +985,6 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:

     # 8) Write out plot file
     if enable_plot:
-        from matplotlib.backends.backend_pdf import PdfPages
         plot_name = base_name + '_metric_spenh.pdf'

         # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
@@ -1015,12 +1015,15 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             tfunc_name = tfunc_name + ' (db)'

         mixspec = 20 * np.log10(abs(mixture_f) + np.finfo(np.float32).eps)
-
-
-
-
-
-
+        fig_obj = plot_mixpred(mixture=mixture,
+                               mixture_f=mixspec,
+                               target=target,
+                               feature=feat_sgram,
+                               predict=predplot,
+                               tp_title=tfunc_name)
+        pdf.savefig(fig_obj)
+        with mgzip.open(base_name + '_metric_spenh_fig1.mfigz', 'wb') as f:
+            pickle.dump(fig_obj, f)

         # ----- page 2, plot unmapped predict, opt truth reconstructed and line plots of mean-over-f
         # pdf.savefig(plot_pdb_predtruth(predict=pred_snr_f, tp_title='predict snr_f (db)'))
@@ -1029,22 +1032,28 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         tg_spec = 20 * np.log10(abs(target_f) + np.finfo(np.float32).eps)
         tg_est_spec = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
         # n_spec = np.reshape(n_spec,(n_spec.shape[0] * n_spec.shape[1], n_spec.shape[2]))
-
-
-
-
-
+        fig_obj = plot_e_predict_truth(predict=tg_est_spec,
+                                       predict_wav=target_est_wav,
+                                       truth_f=tg_spec,
+                                       truth_wav=target_fi,
+                                       metric=np.vstack((lerr_tg_frame, phd_frame)).T,
+                                       tp_title='speech estimate')
+        pdf.savefig(fig_obj)
+        with mgzip.open(base_name + '_metric_spenh_fig2.mfigz', 'wb') as f:
+            pickle.dump(fig_obj, f)

         # page 4 noise extraction
         n_spec = 20 * np.log10(abs(noise_f) + np.finfo(np.float32).eps)
         n_est_spec = 20 * np.log10(abs(noise_est_complex) + np.finfo(np.float32).eps)
-
-
-
-
-
-
+        fig_obj = plot_e_predict_truth(predict=n_est_spec,
+                                       predict_wav=noise_est_wav,
+                                       truth_f=n_spec,
+                                       truth_wav=noise_fi,
+                                       metric=lerr_n_frame,
+                                       tp_title='noise estimate')
+        pdf.savefig(fig_obj)
+        with mgzip.open(base_name + '_metric_spenh_fig4.mfigz', 'wb') as f:
+            pickle.dump(fig_obj, f)

         # Plot error waveforms
         # tg_err_wav = target_fi - target_est_wav
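Besides saving each figure into the PDF, the plotting code now pickles the live figure object through `mgzip` (multi-threaded gzip, the new dependency above) into `.mfigz` files so plots can be reopened and re-styled later. Reading one back is symmetric with the writes; the file name below is illustrative:

```python
import pickle

import mgzip

# mgzip.open mirrors gzip.open, so the compressed pickle reads back directly.
with mgzip.open('mix00001_metric_spenh_fig2.mfigz', 'rb') as f:
    fig = pickle.load(f)  # a live matplotlib Figure (matplotlib must be importable)

fig.savefig('fig2_restored.png')  # or rework axes/labels before saving
```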
@@ -1072,6 +1081,7 @@ def main():
     enable_wav = args['--wav']
     enable_summary = args['--summary']
     predict_location = args['PLOC']
+    num_proc = args['--num_process']
     truth_location = args['TLOC']

     import glob
@@ -1080,6 +1090,7 @@ def main():
     from os.path import join
     from os.path import split

+    import psutil
     from tqdm import tqdm

     from sonusai import create_file_handler
@@ -1153,13 +1164,17 @@ def main():
         fnb = 'metric_spenh_fwhsp_' + asr_model_name + '_'
         logger.info(f'ASR enabled with method {asr_method} and whisper model {asr_model_name}')
         enable_asr_warmup = True
+    elif asr_method == 'sensory':
+        fnb = 'metric_spenh_snsr_' + asr_model_name + '_'
+        logger.info(f'ASR enabled with method {asr_method} and model {asr_model_name}')
+        enable_asr_warmup = True
     else:
         logger.error(f'Unrecognized ASR method: {asr_method}')
         return

     if enable_asr_warmup:
-
-        audio = read_audio(
+        default_speech = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
+        audio = read_audio(default_speech)
         logger.info(f'Warming up asr method, note for cloud service this could take up to a few min ...')
         asr_chk = calc_asr(audio, engine=asr_method, whisper_model_name=asr_model_name)
         logger.info(f'Warmup completed, results {asr_chk}')
@@ -1173,10 +1188,25 @@ def main():
     MP_GLOBAL.asr_method = asr_method
     MP_GLOBAL.asr_model_name = asr_model_name

+    num_cpu = psutil.cpu_count()
+    cpu_percent = psutil.cpu_percent(interval=1)
+    logger.info(f"#CPUs: {num_cpu}, current CPU utilization: {cpu_percent}%")
+    logger.info(f"Memory utilization: {psutil.virtual_memory().percent}%")
+    if num_proc == 'auto':
+        use_cpu = int(num_cpu * (0.9 - cpu_percent / 100))  # default use 80% of available cpus
+    elif num_proc == 'None':
+        use_cpu = None
+    else:
+        use_cpu = min(max(int(num_proc), 1), num_cpu)
+
     # Individual mixtures use pandas print, set precision to 2 decimal places
     # pd.set_option('float_format', '{:.2f}'.format)
+    logger.info(f"Calculating metrics for {len(mixids)} mixtures using {use_cpu} parallel processes ...")
     progress = tqdm(total=len(mixids), desc='calc_metric_spenh')
-
+    if use_cpu is None:
+        all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, no_par=True)
+    else:
+        all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, num_cpus=use_cpu)
     progress.close()

     all_metrics_table_1 = pd.concat([item[0] for item in all_metrics_tables])
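The worker count now comes from the new `--num_process` option plus live load sampled with `psutil`. The same selection logic, restated as a standalone function (behavior copied from the hunk above, including the `auto` headroom factor):

```python
from typing import Optional

import psutil


def pick_worker_count(num_proc: str) -> Optional[int]:
    num_cpu = psutil.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1)  # sample utilization for 1 s
    if num_proc == 'auto':
        # Scale the CPU count down by current utilization, leaving headroom.
        return int(num_cpu * (0.9 - cpu_percent / 100))
    if num_proc == 'None':
        return None  # serial path: pp_tqdm_imap(..., no_par=True)
    return min(max(int(num_proc), 1), num_cpu)
```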
@@ -1209,9 +1239,9 @@ def main():
             for i in range(len(mtab_snr_summary)):
                 if mtab_snr_summary['MXWER'].iloc[i] == 0.0:
                     if mtab_snr_summary['WER'].iloc[i] == 0.0:
-                        mtab_snr_summary['WERi%'].iloc[i] = 0.0
+                        mtab_snr_summary.iloc[i, 6] = 0.0  # mtab_snr_summary['WERi%'].iloc[i] = 0.0
                     else:
-                        mtab_snr_summary['WERi%'].iloc[i] = -999.0
+                        mtab_snr_summary.iloc[i, 6] = -999.0  # mtab_snr_summary['WERi%'].iloc[i] = -999.0
                 else:
                     if ~np.isnan(mtab_snr_summary['WER'].iloc[i]) and ~np.isnan(mtab_snr_summary['MXWER'].iloc[i]):
                         # update WERi% in 6th col
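The motivation for this change: `df['WERi%'].iloc[i] = x` is chained indexing, which can assign into a temporary copy and draws pandas' `SettingWithCopyWarning`, while a single positional `.iloc[row, col]` writes to the frame itself. The diff keeps the hard-coded column position 6; resolving the position by name, as below, is a slightly more defensive variant of the same fix:

```python
import pandas as pd

df = pd.DataFrame({'SNR': [20.0], 'MXWER': [0.0], 'WER': [0.0], 'WERi%': [float('nan')]})

# df['WERi%'].iloc[0] = 0.0                    # chained indexing: may modify a copy
df.iloc[0, df.columns.get_loc('WERi%')] = 0.0  # single-step positional write
```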
@@ -1240,7 +1270,6 @@ def main():
     if num_mix > 1:
         # Print pandas data to files using precision to 2 decimals
         # pd.set_option('float_format', '{:.2f}'.format)
-        csp = 0

         if not truth_est_mode:
             ofname = join(predict_location, fnb + 'summary.txt')
@@ -1280,9 +1309,9 @@ def main():

     # Write summary to .csv file
     if not truth_est_mode:
-        csv_name = join(predict_location, fnb + 'summary.csv')
+        csv_name = str(join(predict_location, fnb + 'summary.csv'))
     else:
-        csv_name = join(predict_location, fnb + '
+        csv_name = str(join(predict_location, fnb + 'truest_summary.csv'))
     header_args = {
         'mode': 'a',
         'encoding': 'utf-8',
@@ -1315,16 +1344,16 @@ def main():
         pd.DataFrame([label]).to_csv(csv_name, **header_args)

     if not truth_est_mode:
-        csv_name = join(predict_location, fnb + 'list.csv')
+        csv_name = str(join(predict_location, fnb + 'list.csv'))
     else:
-        csv_name = join(predict_location, fnb + '
+        csv_name = str(join(predict_location, fnb + 'truest_list.csv'))
     pd.DataFrame(['Speech enhancement metrics list:']).to_csv(csv_name, header=False, index=False)  # open as write
     all_metrics_table_1.round(2).to_csv(csv_name, **table_args)

     if not truth_est_mode:
-        csv_name = join(predict_location, fnb + 'estats_list.csv')
+        csv_name = str(join(predict_location, fnb + 'estats_list.csv'))
     else:
-        csv_name = join(predict_location, fnb + '
+        csv_name = str(join(predict_location, fnb + 'truest_estats_list.csv'))
     pd.DataFrame(['Extraction statistics list:']).to_csv(csv_name, header=False, index=False)  # open as write
     all_metrics_table_2.round(2).to_csv(csv_name, **table_args)

{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc/doc.py

@@ -255,30 +255,6 @@ The 'truth_settings' parameter specifies the following:
 """ + get_truth_functions() + default


-def doc_asr_manifest() -> str:
-    default = f"\nDefault value: {get_default_config()['asr_manifest']}"
-    return """
-'asr_manifest' is a mixture database configuration parameter that defines an
-optional ASR manifest.
-
-The parameter takes a list of manifest files to be used to populate ASR data
-per target. Each line of the manifest should be in the following format:
-
-{"audio_filepath": "/path/to/audio.wav", "text": "the transcription of the utterance", "duration": 23.147}
-
-The audio_filepath field should provide an absolute path to the audio file corresponding
-to the utterance. The text field should contain the full transcript for the utterance,
-and the duration field should reflect the duration of the utterance in seconds.
-
-Each entry in the manifest (describing one audio file) should be bordered by '{' and '}'
-and must be contained on one line. The fields that describe the file should be separated
-by commas, and have the form "field_name": value, as shown above.
-
-Since the manifest specifies the path for each utterance, the audio files do not have to be
-located in the same directory as the manifest, or even in any specific directory structure.
-""" + default
-
-
 def doc_augmentations() -> str:
     return """
 Augmentation Rules
sonusai-0.18.0/sonusai/genmetrics.py (new file)

@@ -0,0 +1,146 @@
+# Generate mixdb metrics based on metrics listed in config.yml
+
+
+class MixtureMetrics:
+    @property
+    def mxsnr(self):
+        ...
+
+    @property
+    def mxssnravg(self):
+        ...
+
+    @property
+    def mxssnrstd(self):
+        ...
+
+    @property
+    def mxssnrdavg(self):
+        ...
+
+    @property
+    def mxssnrdstd(self):
+        ...
+
+    @property
+    def mxpesq(self):
+        ...
+
+    @property
+    def mxwsdr(self):
+        ...
+
+    @property
+    def mxpd(self):
+        ...
+
+    @property
+    def mxstoi(self):
+        ...
+
+    @property
+    def mxcsig(self):
+        ...
+
+    @property
+    def mxcbak(self):
+        ...
+
+    @property
+    def mxcovl(self):
+        ...
+
+    def mxwer(self, engine: str, model: str):
+        ...
+
+    @property
+    def tdco(self):
+        ...
+
+    @property
+    def tmin(self):
+        ...
+
+    @property
+    def tmax(self):
+        ...
+
+    @property
+    def tpkdb(self):
+        ...
+
+    @property
+    def tlrms(self):
+        ...
+
+    @property
+    def tpkr(self):
+        ...
+
+    @property
+    def ttr(self):
+        ...
+
+    @property
+    def tcr(self):
+        ...
+
+    @property
+    def tfl(self):
+        ...
+
+    @property
+    def tpkc(self):
+        ...
+
+    @property
+    def ndco(self):
+        ...
+
+    @property
+    def nmin(self):
+        ...
+
+    @property
+    def nmax(self):
+        ...
+
+    @property
+    def npkdb(self):
+        ...
+
+    @property
+    def nlrms(self):
+        ...
+
+    @property
+    def npkr(self):
+        ...
+
+    @property
+    def ntr(self):
+        ...
+
+    @property
+    def ncr(self):
+        ...
+
+    @property
+    def nfl(self):
+        ...
+
+    @property
+    def npkc(self):
+        ...
+
+    @property
+    def sedavg(self):
+        ...
+
+    @property
+    def sedcnt(self):
+        ...
+
+    @property
+    def sedtopn(self):
+        ...
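The new `genmetrics.py` lands as stubs: every metric is declared but left as `...`. One plausible shape for filling these in is lazy per-mixture computation with caching, sketched here with `functools.cached_property` (the energy-ratio body is illustrative, not sonusai's implementation):

```python
from functools import cached_property

import numpy as np


class MixtureMetricsSketch:
    """Sketch only: compute each metric on first access, then cache it."""

    def __init__(self, target: np.ndarray, noise: np.ndarray) -> None:
        self._target = target
        self._noise = noise

    @cached_property
    def mxsnr(self) -> float:
        eps = np.finfo(np.float32).eps
        return float(10 * np.log10(np.sum(self._target ** 2) /
                                   (np.sum(self._noise ** 2) + eps)))
```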
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genmixdb.py

@@ -174,7 +174,6 @@ def genmixdb(location: str,
     from sonusai.mixture import initialize_db
     from sonusai.mixture import load_config
     from sonusai.mixture import log_duration_and_sizes
-    from sonusai.mixture import populate_asr_manifest_table
     from sonusai.mixture import populate_class_label_table
     from sonusai.mixture import populate_class_weights_threshold_table
     from sonusai.mixture import populate_impulse_response_file_table
@@ -195,7 +194,6 @@ def genmixdb(location: str,
     mixdb = MixtureDatabase(location=location, test=test)

     populate_top_table(location, config, test)
-    populate_asr_manifest_table(location, config, test)
     populate_class_label_table(location, config, test)
     populate_class_weights_threshold_table(location, config, test)
     populate_spectral_mask_table(location, config, test)
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/__init__.py

@@ -88,7 +88,6 @@ from .feature import get_feature_from_audio
 from .generation import generate_mixtures
 from .generation import get_all_snrs_from_config
 from .generation import initialize_db
-from .generation import populate_asr_manifest_table
 from .generation import populate_class_label_table
 from .generation import populate_class_weights_threshold_table
 from .generation import populate_impulse_response_file_table
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/datatypes.py

@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from dataclasses import field
 from typing import Optional
 from typing import TypeAlias

@@ -135,13 +134,7 @@ class UniversalSNR(float):
         return self._is_random


-
-# class UniversalSNR:
-#     is_random: bool
-#     value: float
-#
-#     def __lt__(self, other) -> bool:
-#         return self.value < other.value
+Speaker: TypeAlias = dict[str, str]


 @dataclass
@@ -151,6 +144,7 @@ class TargetFile(DataClassSonusAIMixin):
     truth_settings: TruthSettings
     class_balancing_augmentation: Optional[AugmentationRule] = None
     level_type: Optional[str] = None
+    speaker_id: Optional[int] = None

     @property
     def duration(self) -> float:
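The removed block was a dead, commented-out draft of `UniversalSNR`; in its place `datatypes.py` gains a `Speaker` alias, and `TargetFile` gains an optional `speaker_id`, which reads as a key into a table of speaker metadata records. A hypothetical illustration of that relationship (the table and field names are ours, not from the diff):

```python
from typing import Optional

Speaker = dict[str, str]  # the new alias: free-form speaker metadata

# Hypothetical table indexed by TargetFile.speaker_id.
speakers: list[Speaker] = [{'name': 'MA01', 'gender': 'M', 'dialect': 'Mandarin'}]


def speaker_field(speaker_id: Optional[int], key: str) -> Optional[str]:
    return None if speaker_id is None else speakers[speaker_id].get(key)
```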
@@ -317,7 +311,6 @@ class FeatureGeneratorInfo:

 @dataclass
 class MixtureDatabaseConfig(DataClassSonusAIMixin):
-    asr_manifest: list[str] = field(default_factory=list)
     class_balancing: Optional[bool] = False
     class_labels: Optional[list[str]] = None
     class_weights_threshold: Optional[list[float]] = None