sonusai 0.17.2__tar.gz → 0.18.0__tar.gz

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (128)
  1. {sonusai-0.17.2 → sonusai-0.18.0}/PKG-INFO +4 -1
  2. {sonusai-0.17.2 → sonusai-0.18.0}/pyproject.toml +6 -2
  3. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/__init__.py +0 -1
  4. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/audiofe.py +3 -3
  5. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/calc_metric_spenh.py +81 -52
  6. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc/doc.py +0 -24
  7. sonusai-0.18.0/sonusai/genmetrics.py +146 -0
  8. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genmixdb.py +0 -2
  9. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/__init__.py +0 -1
  10. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/constants.py +0 -1
  11. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/datatypes.py +2 -9
  12. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/generation.py +136 -38
  13. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/helpers.py +58 -1
  14. sonusai-0.18.0/sonusai/mixture/mapped_snr_f.py +100 -0
  15. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/mixdb.py +293 -170
  16. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/sox_augmentation.py +3 -0
  17. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/tokenized_shell_vars.py +8 -1
  18. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mkwav.py +4 -4
  19. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/onnx_predict.py +2 -2
  20. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/post_spenh_targetf.py +2 -2
  21. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/textgrid.py +6 -24
  22. sonusai-0.17.2/sonusai/speech/voxceleb2.py → sonusai-0.18.0/sonusai/speech/voxceleb.py +19 -3
  23. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/__init__.py +1 -1
  24. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asr_functions/aaware_whisper.py +2 -2
  25. sonusai-0.17.2/sonusai/utils/wave.py → sonusai-0.18.0/sonusai/utils/write_audio.py +2 -2
  26. sonusai-0.17.2/sonusai/mixture/mapped_snr_f.py +0 -53
  27. sonusai-0.17.2/sonusai/mixture/speaker_metadata.py +0 -35
  28. sonusai-0.17.2/sonusai/mkmanifest.py +0 -209
  29. sonusai-0.17.2/sonusai/utils/asr_manifest_functions/__init__.py +0 -6
  30. sonusai-0.17.2/sonusai/utils/asr_manifest_functions/data.py +0 -1
  31. sonusai-0.17.2/sonusai/utils/asr_manifest_functions/librispeech.py +0 -46
  32. sonusai-0.17.2/sonusai/utils/asr_manifest_functions/mcgill_speech.py +0 -29
  33. sonusai-0.17.2/sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +0 -66
  34. {sonusai-0.17.2 → sonusai-0.18.0}/README.rst +0 -0
  35. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/aawscd_probwrite.py +0 -0
  36. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/__init__.py +0 -0
  37. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/genmixdb.yml +0 -0
  38. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/speech_ma01_01.wav +0 -0
  39. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/data/whitenoise.wav +0 -0
  40. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc/__init__.py +0 -0
  41. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc.py +0 -0
  42. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genft.py +0 -0
  43. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genmix.py +0 -0
  44. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/gentcst.py +0 -0
  45. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/lsdb.py +0 -0
  46. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/main.py +0 -0
  47. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/__init__.py +0 -0
  48. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_class_weights.py +0 -0
  49. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_optimal_thresholds.py +0 -0
  50. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_pcm.py +0 -0
  51. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_pesq.py +0 -0
  52. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_sa_sdr.py +0 -0
  53. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_sample_weights.py +0 -0
  54. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_wer.py +0 -0
  55. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/calc_wsdr.py +0 -0
  56. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/class_summary.py +0 -0
  57. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/confusion_matrix_summary.py +0 -0
  58. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/one_hot.py +0 -0
  59. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/metrics/snr_summary.py +0 -0
  60. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/audio.py +0 -0
  61. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/augmentation.py +0 -0
  62. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/class_count.py +0 -0
  63. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/config.py +0 -0
  64. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/eq_rule_is_valid.py +0 -0
  65. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/feature.py +0 -0
  66. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/log_duration_and_sizes.py +0 -0
  67. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/soundfile_audio.py +0 -0
  68. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/sox_audio.py +0 -0
  69. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/spectral_mask.py +0 -0
  70. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/target_class_balancing.py +0 -0
  71. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/targets.py +0 -0
  72. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/torchaudio_audio.py +0 -0
  73. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/torchaudio_augmentation.py +0 -0
  74. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth.py +0 -0
  75. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/__init__.py +0 -0
  76. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/crm.py +0 -0
  77. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/data.py +0 -0
  78. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/energy.py +0 -0
  79. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/file.py +0 -0
  80. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/phoneme.py +0 -0
  81. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/sed.py +0 -0
  82. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/truth_functions/target.py +0 -0
  83. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/plot.py +0 -0
  84. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/queries/__init__.py +0 -0
  85. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/queries/queries.py +0 -0
  86. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/__init__.py +0 -0
  87. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/l2arctic.py +0 -0
  88. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/librispeech.py +0 -0
  89. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/mcgill.py +0 -0
  90. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/timit.py +0 -0
  91. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/types.py +0 -0
  92. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/speech/vctk.py +0 -0
  93. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/summarize_metric_spenh.py +0 -0
  94. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/tplot.py +0 -0
  95. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asl_p56.py +0 -0
  96. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asr.py +0 -0
  97. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/asr_functions/__init__.py +0 -0
  98. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/audio_devices.py +0 -0
  99. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/braced_glob.py +0 -0
  100. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/calculate_input_shape.py +0 -0
  101. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/convert_string_to_number.py +0 -0
  102. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/create_timestamp.py +0 -0
  103. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/create_ts_name.py +0 -0
  104. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/dataclass_from_dict.py +0 -0
  105. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/db.py +0 -0
  106. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/docstring.py +0 -0
  107. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/energy_f.py +0 -0
  108. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/engineering_number.py +0 -0
  109. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/get_frames_per_batch.py +0 -0
  110. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/get_label_names.py +0 -0
  111. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/grouper.py +0 -0
  112. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/human_readable_size.py +0 -0
  113. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/max_text_width.py +0 -0
  114. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/model_utils.py +0 -0
  115. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/numeric_conversion.py +0 -0
  116. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/onnx_utils.py +0 -0
  117. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/parallel.py +0 -0
  118. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/path_info.py +0 -0
  119. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/print_mixture_details.py +0 -0
  120. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/ranges.py +0 -0
  121. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/read_mixture_data.py +0 -0
  122. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/read_predict_data.py +0 -0
  123. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/reshape.py +0 -0
  124. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/seconds_to_hms.py +0 -0
  125. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/stacked_complex.py +0 -0
  126. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/stratified_shuffle_split.py +0 -0
  127. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/utils/yes_or_no.py +0 -0
  128. {sonusai-0.17.2 → sonusai-0.18.0}/sonusai/vars.py +0 -0
{sonusai-0.17.2 → sonusai-0.18.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonusai
- Version: 0.17.2
+ Version: 0.18.0
  Summary: Framework for building deep neural network models for sound, speech, and voice AI
  Home-page: https://aaware.com
  License: GPL-3.0-only
@@ -21,12 +21,15 @@ Requires-Dist: h5py (>=3.11.0,<4.0.0)
  Requires-Dist: jiwer (>=3.0.3,<4.0.0)
  Requires-Dist: librosa (>=0.10.1,<0.11.0)
  Requires-Dist: matplotlib (>=3.8.0,<4.0.0)
+ Requires-Dist: mgzip (>=0.2.1,<0.3.0)
+ Requires-Dist: numpy (>=1.26.4,<2.0.0)
  Requires-Dist: onnx (>=1.14.1,<2.0.0)
  Requires-Dist: onnxruntime (>=1.16.1,<2.0.0)
  Requires-Dist: paho-mqtt (>=2.0.0,<3.0.0)
  Requires-Dist: pandas (>=2.1.1,<3.0.0)
  Requires-Dist: pesq (>=0.0.4,<0.0.5)
  Requires-Dist: praatio (>=6.2.0,<7.0.0)
+ Requires-Dist: psutil (>=5,<6)
  Requires-Dist: pyaaware (>=1.5.7,<2.0.0)
  Requires-Dist: pyaudio (>=0.2.14,<0.3.0)
  Requires-Dist: pydub (>=0.25.1,<0.26.0)
{sonusai-0.17.2 → sonusai-0.18.0}/pyproject.toml

@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sonusai"
- version = "0.17.2"
+ version = "0.18.0"
  description = "Framework for building deep neural network models for sound, speech, and voice AI"
  authors = ["Chris Eddington <chris@aaware.com>", "Jason Calderwood <jason@aaware.com>"]
  maintainers = ["Chris Eddington <chris@aaware.com>", "Jason Calderwood <jason@aaware.com>"]
@@ -21,12 +21,15 @@ h5py = "^3.11.0"
  jiwer = "^3.0.3"
  librosa = "^0.10.1"
  matplotlib = "^3.8.0"
+ mgzip = "^0.2.1"
+ numpy = "^1.26.4"
  onnx = "^1.14.1"
  onnxruntime = "^1.16.1"
  paho-mqtt = "^2.0.0"
  pandas = "^2.1.1"
  pesq = "^0.0.4"
  praatio = "^6.2.0"
+ psutil = "^5"
  pyaaware = "^1.5.7"
  pyaudio = "^0.2.14"
  pydub = "^0.25.1"
@@ -47,7 +50,8 @@ mypy = "^1.6.0"
  mypy-extensions = "^1.0.0"
  pytest = "^8.1.1"
  sonusai-asr-cloud = "^0.1.0"
- sonusai-torchl = "^0.1.0"
+ sonusai-asr-sensory = "^0.1.0"
+ sonusai-torchl = "^0.3.0"
  types-pyyaml = "^6.0.12.12"
  types-requests = "^2.31.0.8"

{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/__init__.py

@@ -14,7 +14,6 @@ commands_doc = """
     genmixdb       Generate a mixture database
     gentcst        Generate target configuration from a subdirectory tree
     lsdb           List information about a mixture database
-    mkmanifest     Make ASR manifest JSON file
     mkwav          Make WAV files from a mixture database
     onnx_predict   Run ONNX predict on a trained model
     plot           Plot mixture data
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/audiofe.py

@@ -86,7 +86,7 @@ def main() -> None:
      from sonusai.utils import create_timestamp
      from sonusai.utils import get_input_devices
      from sonusai.utils import load_ort_session
-     from sonusai.utils import write_wav
+     from sonusai.utils import write_audio

      ts = create_timestamp()
      capture_name = f'audiofe_capture_{ts}'
@@ -121,7 +121,7 @@ def main() -> None:
          logger.exception(e)
          return
      # Only write if capture from device, not for file input
-     write_wav(capture_wav, capture_audio, SAMPLE_RATE)
+     write_audio(capture_wav, capture_audio, SAMPLE_RATE)
      logger.info('')
      logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_wav}')

@@ -175,7 +175,7 @@ def main() -> None:
      logger.info(f'Wrote predict with shape {predict.shape} to {h5_name}')

      predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
-     write_wav(predict_wav, predict_audio, SAMPLE_RATE)
+     write_audio(predict_wav, predict_audio, SAMPLE_RATE)
      logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {predict_wav}')
      if debug:
          with h5py.File(h5_name, 'a') as f:
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/calc_metric_spenh.py

@@ -1,6 +1,6 @@
  """sonusai calc_metric_spenh

- usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] PLOC TLOC
+ usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] [-n NCPU] PLOC TLOC

  options:
      -h, --help
@@ -10,6 +10,7 @@ options:
      -p, --plot                   Enable PDF plots file generation per mixture.
      -w, --wav                    Generate WAV files per mixture.
      -s, --summary                Enable summary files generation.
+     -n, --num_process NCPU       Number of parallel processes to use [default: auto]
      -e ASR, --asr-method ASR     ASR method: deepgram, google, aixplain_whisper, whisper, or sensory. [default: none]
      -m MODEL, --model            ASR model name used in some ASR methods. [default: tiny]

@@ -154,8 +155,8 @@ def snr(clean_speech, processed_speech, sample_rate):
          signal_energy = np.sum(np.square(clean_frame))
          noise_energy = np.sum(np.square(clean_frame - processed_frame))
          segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
-         segmental_snr[frame_count] = np.max(segmental_snr[frame_count], min_snr)
-         segmental_snr[frame_count] = np.min(segmental_snr[frame_count], max_snr)
+         segmental_snr[frame_count] = max(segmental_snr[frame_count], min_snr)
+         segmental_snr[frame_count] = min(segmental_snr[frame_count], max_snr)

          start = start + skip_rate

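This two-line fix is worth noting: np.max and np.min are reductions whose second positional argument is axis, so np.max(x, min_snr) never clamped anything. The builtins (or np.clip) do. A minimal illustration with made-up values:

import numpy as np

min_snr, max_snr = -10.0, 35.0
value = -12.3  # example frame SNR in dB

# np.max(value, min_snr) would pass min_snr as the `axis` argument,
# not as a lower bound; builtins clamp scalars correctly.
clamped = min(max(value, min_snr), max_snr)  # -10.0
assert clamped == float(np.clip(value, min_snr, max_snr))
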
@@ -697,11 +698,14 @@ def plot_e_predict_truth(predict: np.ndarray,


  def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
+     import pickle
      from os.path import basename
      from os.path import join
      from os.path import splitext

      import h5py
+     import mgzip
+     from matplotlib.backends.backend_pdf import PdfPages
      from numpy import inf
      from pystoi import stoi

@@ -718,7 +722,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
      from sonusai.utils import reshape_outputs
      from sonusai.utils import stack_complex
      from sonusai.utils import unstack_complex
-     from sonusai.utils import write_wav
+     from sonusai.utils import write_audio

      mixdb = MP_GLOBAL.mixdb
      predict_location = MP_GLOBAL.predict_location
@@ -800,8 +804,12 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          mixture = mixture[0:-trim_t]
          truth_f = truth_f[0:-trim_f, :]
      elif predict.shape[0] > target_f.shape[0]:
-         raise SonusAIError(
-             f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')
+         logger.debug(
+             f'Warning: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')
+         trim_f = predict.shape[0] - target_f.shape[0]
+         predict = predict[0:-trim_f, :]
+         # raise SonusAIError(
+         #     f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')

      # 3) Extraction - format proper complex and wav estimates and truth (unstack, uncompress, inv tf, etc.)
      if truth_est_mode:
@@ -883,13 +891,9 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          wer_tge = float('nan')
          wer_pi = float('nan')
      else:
-         asr_tt = MP_GLOBAL.mixdb.get_speech_metadata(mixid, 'text')[0]  # ignore mixup
+         asr_tt = MP_GLOBAL.mixdb.mixture_speech_metadata(mixid, 'text')[0]  # ignore mixup
          if asr_tt is None:
              asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth
-         # if MP_GLOBAL.mixdb.asr_manifests:
-         #     asr_tt = MP_GLOBAL.mixdb.mixture_asr_data(mixid)[0]  # ignore mixup
-         # else:
-         #     asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth

          if asr_tt:
              asr_mx = calc_asr(mixture, engine=asr_method, whisper_model_name=asr_model_name).text
@@ -957,10 +961,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          print(f'Noise path: {mixdb.noise_file(ni).name}', file=f)
          if asr_method != 'none':
              print(f'ASR method: {asr_method} and whisper model (if used): {asr_model_name}', file=f)
-             if mixdb.asr_manifests:
-                 print(f'ASR truth from metadata: {asr_tt}', file=f)
-             else:
-                 print(f'ASR truth from wer method: {asr_tt}', file=f)
+             print(f'ASR truth: {asr_tt}', file=f)
              print(f'ASR result for mixture: {asr_mx}', file=f)
              print(f'ASR result for prediction: {asr_tge}', file=f)

@@ -968,12 +969,12 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:

      # 7) write wav files
      if enable_wav:
-         write_wav(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
-         write_wav(name=base_name + '_target.wav', audio=float_to_int16(target))
-         # write_wav(name=base_name + '_target_fi.wav', audio=float_to_int16(target_fi))
-         write_wav(name=base_name + '_noise.wav', audio=float_to_int16(noise))
-         write_wav(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
-         write_wav(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))
+         write_audio(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
+         write_audio(name=base_name + '_target.wav', audio=float_to_int16(target))
+         # write_audio(name=base_name + '_target_fi.wav', audio=float_to_int16(target_fi))
+         write_audio(name=base_name + '_noise.wav', audio=float_to_int16(noise))
+         write_audio(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
+         write_audio(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))

      # debug code to test for perfect reconstruction of the extraction method
      # note both 75% olsa-hanns and 50% olsa-hann modes checked to have perfect reconstruction
@@ -984,7 +985,6 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:

      # 8) Write out plot file
      if enable_plot:
-         from matplotlib.backends.backend_pdf import PdfPages
          plot_name = base_name + '_metric_spenh.pdf'

          # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
@@ -1015,12 +1015,15 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
              tfunc_name = tfunc_name + ' (db)'

          mixspec = 20 * np.log10(abs(mixture_f) + np.finfo(np.float32).eps)
-         pdf.savefig(plot_mixpred(mixture=mixture,
-                                  mixture_f=mixspec,
-                                  target=target,
-                                  feature=feat_sgram,
-                                  predict=predplot,
-                                  tp_title=tfunc_name))
+         fig_obj = plot_mixpred(mixture=mixture,
+                                mixture_f=mixspec,
+                                target=target,
+                                feature=feat_sgram,
+                                predict=predplot,
+                                tp_title=tfunc_name)
+         pdf.savefig(fig_obj)
+         with mgzip.open(base_name + '_metric_spenh_fig1.mfigz', 'wb') as f:
+             pickle.dump(fig_obj, f)

          # ----- page 2, plot unmapped predict, opt truth reconstructed and line plots of mean-over-f
          # pdf.savefig(plot_pdb_predtruth(predict=pred_snr_f, tp_title='predict snr_f (db)'))
@@ -1029,22 +1032,28 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          tg_spec = 20 * np.log10(abs(target_f) + np.finfo(np.float32).eps)
          tg_est_spec = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
          # n_spec = np.reshape(n_spec,(n_spec.shape[0] * n_spec.shape[1], n_spec.shape[2]))
-         pdf.savefig(plot_e_predict_truth(predict=tg_est_spec,
-                                          predict_wav=target_est_wav,
-                                          truth_f=tg_spec,
-                                          truth_wav=target_fi,
-                                          metric=np.vstack((lerr_tg_frame, phd_frame)).T,
-                                          tp_title='speech estimate'))
+         fig_obj = plot_e_predict_truth(predict=tg_est_spec,
+                                        predict_wav=target_est_wav,
+                                        truth_f=tg_spec,
+                                        truth_wav=target_fi,
+                                        metric=np.vstack((lerr_tg_frame, phd_frame)).T,
+                                        tp_title='speech estimate')
+         pdf.savefig(fig_obj)
+         with mgzip.open(base_name + '_metric_spenh_fig2.mfigz', 'wb') as f:
+             pickle.dump(fig_obj, f)

          # page 4 noise extraction
          n_spec = 20 * np.log10(abs(noise_f) + np.finfo(np.float32).eps)
          n_est_spec = 20 * np.log10(abs(noise_est_complex) + np.finfo(np.float32).eps)
-         pdf.savefig(plot_e_predict_truth(predict=n_est_spec,
-                                          predict_wav=noise_est_wav,
-                                          truth_f=n_spec,
-                                          truth_wav=noise_fi,
-                                          metric=lerr_n_frame,
-                                          tp_title='noise estimate'))
+         fig_obj = plot_e_predict_truth(predict=n_est_spec,
+                                        predict_wav=noise_est_wav,
+                                        truth_f=n_spec,
+                                        truth_wav=noise_fi,
+                                        metric=lerr_n_frame,
+                                        tp_title='noise estimate')
+         pdf.savefig(fig_obj)
+         with mgzip.open(base_name + '_metric_spenh_fig4.mfigz', 'wb') as f:
+             pickle.dump(fig_obj, f)

          # Plot error waveforms
          # tg_err_wav = target_fi - target_est_wav
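The new .mfigz files written alongside the PDF are gzip-compressed pickles of the Matplotlib figure objects (mgzip.open is a multi-threaded drop-in for gzip.open). A minimal sketch of loading one back for interactive inspection; the file name below is hypothetical, and unpickling a figure generally requires a compatible Matplotlib version:

import pickle

import matplotlib.pyplot as plt
import mgzip

# Read a compressed, pickled figure back into a live Figure object.
with mgzip.open('mixture0001_metric_spenh_fig1.mfigz', 'rb') as f:
    fig = pickle.load(f)

fig.savefig('fig1_copy.png')  # or edit the axes, then re-save
plt.show()
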
@@ -1072,6 +1081,7 @@ def main():
      enable_wav = args['--wav']
      enable_summary = args['--summary']
      predict_location = args['PLOC']
+     num_proc = args['--num_process']
      truth_location = args['TLOC']

      import glob
@@ -1080,6 +1090,7 @@ def main():
      from os.path import join
      from os.path import split

+     import psutil
      from tqdm import tqdm

      from sonusai import create_file_handler
@@ -1153,13 +1164,17 @@ def main():
          fnb = 'metric_spenh_fwhsp_' + asr_model_name + '_'
          logger.info(f'ASR enabled with method {asr_method} and whisper model {asr_model_name}')
          enable_asr_warmup = True
+     elif asr_method == 'sensory':
+         fnb = 'metric_spenh_snsr_' + asr_model_name + '_'
+         logger.info(f'ASR enabled with method {asr_method} and model {asr_model_name}')
+         enable_asr_warmup = True
      else:
          logger.error(f'Unrecognized ASR method: {asr_method}')
          return

      if enable_asr_warmup:
-         DEFAULT_SPEECH = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
-         audio = read_audio(DEFAULT_SPEECH)
+         default_speech = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
+         audio = read_audio(default_speech)
          logger.info(f'Warming up asr method, note for cloud service this could take up to a few min ...')
          asr_chk = calc_asr(audio, engine=asr_method, whisper_model_name=asr_model_name)
          logger.info(f'Warmup completed, results {asr_chk}')
@@ -1173,10 +1188,25 @@ def main():
      MP_GLOBAL.asr_method = asr_method
      MP_GLOBAL.asr_model_name = asr_model_name

+     num_cpu = psutil.cpu_count()
+     cpu_percent = psutil.cpu_percent(interval=1)
+     logger.info(f"#CPUs: {num_cpu}, current CPU utilization: {cpu_percent}%")
+     logger.info(f"Memory utilization: {psutil.virtual_memory().percent}%")
+     if num_proc == 'auto':
+         use_cpu = int(num_cpu * (0.9 - cpu_percent / 100))  # default use 80% of available cpus
+     elif num_proc == 'None':
+         use_cpu = None
+     else:
+         use_cpu = min(max(int(num_proc), 1), num_cpu)
+
      # Individual mixtures use pandas print, set precision to 2 decimal places
      # pd.set_option('float_format', '{:.2f}'.format)
+     logger.info(f"Calculating metrics for {len(mixids)} mixtures using {use_cpu} parallel processes ...")
      progress = tqdm(total=len(mixids), desc='calc_metric_spenh')
-     all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, num_cpus=8)
+     if use_cpu is None:
+         all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, no_par=True)
+     else:
+         all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, num_cpus=use_cpu)
      progress.close()

      all_metrics_table_1 = pd.concat([item[0] for item in all_metrics_tables])
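The --num_process handling above is the heart of the change away from the hard-coded num_cpus=8: 'auto' derives a worker count from current load, the literal string 'None' forces serial execution (no_par=True), and anything else is clamped to [1, num_cpu]. A standalone sketch of the same logic; the floor of 1 on the 'auto' path is an addition here, not in the release (whose inline '80%' comment also does not match its 0.9 factor):

import psutil


def pick_worker_count(num_proc: str) -> int | None:
    """Map the --num_process option to a worker count; None means run serially."""
    num_cpu = psutil.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1)  # sample utilization over 1 s
    if num_proc == 'auto':
        # Scale workers back by how busy the machine already is.
        return max(int(num_cpu * (0.9 - cpu_percent / 100)), 1)
    if num_proc == 'None':
        return None  # caller should pass no_par=True
    return min(max(int(num_proc), 1), num_cpu)
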
@@ -1209,9 +1239,9 @@ def main():
          for i in range(len(mtab_snr_summary)):
              if mtab_snr_summary['MXWER'].iloc[i] == 0.0:
                  if mtab_snr_summary['WER'].iloc[i] == 0.0:
-                     mtab_snr_summary['WERi%'].iloc[i] = 0.0
+                     mtab_snr_summary.iloc[i, 6] = 0.0  # mtab_snr_summary['WERi%'].iloc[i] = 0.0
                  else:
-                     mtab_snr_summary['WERi%'].iloc[i] = -999.0
+                     mtab_snr_summary.iloc[i, 6] = -999.0  # mtab_snr_summary['WERi%'].iloc[i] = -999.0
              else:
                  if ~np.isnan(mtab_snr_summary['WER'].iloc[i]) and ~np.isnan(mtab_snr_summary['MXWER'].iloc[i]):
                      # update WERi% in 6th col
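The switch from mtab_snr_summary['WERi%'].iloc[i] = ... to mtab_snr_summary.iloc[i, 6] = ... avoids pandas chained assignment: selecting the column first can hand back a copy, so the write may never reach the frame (the familiar SettingWithCopyWarning). A single indexing call writes in place; columns.get_loc avoids hard-coding the column position (data below is illustrative):

import pandas as pd

df = pd.DataFrame({'MXWER': [0.0, 0.3], 'WER': [0.0, 0.2], 'WERi%': [0.0, 0.0]})

# Chained form -- may silently update a temporary copy:
#     df['WERi%'].iloc[0] = 12.5
# Single-call form writes to the frame itself:
df.iloc[0, df.columns.get_loc('WERi%')] = 12.5
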
@@ -1240,7 +1270,6 @@ def main():
      if num_mix > 1:
          # Print pandas data to files using precision to 2 decimals
          # pd.set_option('float_format', '{:.2f}'.format)
-         csp = 0

          if not truth_est_mode:
              ofname = join(predict_location, fnb + 'summary.txt')
@@ -1280,9 +1309,9 @@ def main():

          # Write summary to .csv file
          if not truth_est_mode:
-             csv_name = join(predict_location, fnb + 'summary.csv')
+             csv_name = str(join(predict_location, fnb + 'summary.csv'))
          else:
-             csv_name = join(predict_location, fnb + 'summary_truest.csv')
+             csv_name = str(join(predict_location, fnb + 'truest_summary.csv'))
          header_args = {
              'mode': 'a',
              'encoding': 'utf-8',
@@ -1315,16 +1344,16 @@ def main():
          pd.DataFrame([label]).to_csv(csv_name, **header_args)

          if not truth_est_mode:
-             csv_name = join(predict_location, fnb + 'list.csv')
+             csv_name = str(join(predict_location, fnb + 'list.csv'))
          else:
-             csv_name = join(predict_location, fnb + 'list_truest.csv')
+             csv_name = str(join(predict_location, fnb + 'truest_list.csv'))
          pd.DataFrame(['Speech enhancement metrics list:']).to_csv(csv_name, header=False, index=False)  # open as write
          all_metrics_table_1.round(2).to_csv(csv_name, **table_args)

          if not truth_est_mode:
-             csv_name = join(predict_location, fnb + 'estats_list.csv')
+             csv_name = str(join(predict_location, fnb + 'estats_list.csv'))
          else:
-             csv_name = join(predict_location, fnb + 'estats_list_truest.csv')
+             csv_name = str(join(predict_location, fnb + 'truest_estats_list.csv'))
          pd.DataFrame(['Extraction statistics list:']).to_csv(csv_name, header=False, index=False)  # open as write
          all_metrics_table_2.round(2).to_csv(csv_name, **table_args)

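All of these writes rely on one append pattern: the first to_csv call creates (or truncates) the file, and later calls reuse it with mode='a'. A self-contained sketch; only the 'mode' and 'encoding' keys of the args dicts are visible in the hunk, the rest is assumed:

import pandas as pd

csv_name = 'metric_spenh_summary.csv'
table_args = {'mode': 'a', 'encoding': 'utf-8'}

# First write creates the file with a title row...
pd.DataFrame(['Speech enhancement metrics list:']).to_csv(csv_name, header=False, index=False)

# ...later writes append the rounded metrics below it.
metrics = pd.DataFrame({'PESQ': [2.345678], 'STOI': [0.876543]})
metrics.round(2).to_csv(csv_name, **table_args)
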
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/doc/doc.py

@@ -255,30 +255,6 @@ The 'truth_settings' parameter specifies the following:
  """ + get_truth_functions() + default


- def doc_asr_manifest() -> str:
-     default = f"\nDefault value: {get_default_config()['asr_manifest']}"
-     return """
- 'asr_manifest' is a mixture database configuration parameter that defines an
- optional ASR manifest.
-
- The parameter takes a list of manifest files to be used to populate ASR data
- per target. Each line of the manifest should be in the following format:
-
- {"audio_filepath": "/path/to/audio.wav", "text": "the transcription of the utterance", "duration": 23.147}
-
- The audio_filepath field should provide an absolute path to the audio file corresponding
- to the utterance. The text field should contain the full transcript for the utterance,
- and the duration field should reflect the duration of the utterance in seconds.
-
- Each entry in the manifest (describing one audio file) should be bordered by '{' and '}'
- and must be contained on one line. The fields that describe the file should be separated
- by commas, and have the form "field_name": value, as shown above.
-
- Since the manifest specifies the path for each utterance, the audio files do not have to be
- located in the same directory as the manifest, or even in any specific directory structure.
- """ + default
-
-
  def doc_augmentations() -> str:
      return """
  Augmentation Rules
sonusai-0.18.0/sonusai/genmetrics.py (new file)

@@ -0,0 +1,146 @@
+ # Generate mixdb metrics based on metrics listed in config.yml
+
+
+ class MixtureMetrics:
+     @property
+     def mxsnr(self):
+         ...
+
+     @property
+     def mxssnravg(self):
+         ...
+
+     @property
+     def mxssnrstd(self):
+         ...
+
+     @property
+     def mxssnrdavg(self):
+         ...
+
+     @property
+     def mxssnrdstd(self):
+         ...
+
+     @property
+     def mxpesq(self):
+         ...
+
+     @property
+     def mxwsdr(self):
+         ...
+
+     @property
+     def mxpd(self):
+         ...
+
+     @property
+     def mxstoi(self):
+         ...
+
+     @property
+     def mxcsig(self):
+         ...
+
+     @property
+     def mxcbak(self):
+         ...
+
+     @property
+     def mxcovl(self):
+         ...
+
+     def mxwer(self, engine: str, model: str):
+         ...
+
+     @property
+     def tdco(self):
+         ...
+
+     @property
+     def tmin(self):
+         ...
+
+     @property
+     def tmax(self):
+         ...
+
+     @property
+     def tpkdb(self):
+         ...
+
+     @property
+     def tlrms(self):
+         ...
+
+     @property
+     def tpkr(self):
+         ...
+
+     @property
+     def ttr(self):
+         ...
+
+     @property
+     def tcr(self):
+         ...
+
+     @property
+     def tfl(self):
+         ...
+
+     @property
+     def tpkc(self):
+         ...
+
+     @property
+     def ndco(self):
+         ...
+
+     @property
+     def nmin(self):
+         ...
+
+     @property
+     def nmax(self):
+         ...
+
+     @property
+     def npkdb(self):
+         ...
+
+     @property
+     def nlrms(self):
+         ...
+
+     @property
+     def npkr(self):
+         ...
+
+     @property
+     def ntr(self):
+         ...
+
+     @property
+     def ncr(self):
+         ...
+
+     @property
+     def nfl(self):
+         ...
+
+     @property
+     def npkc(self):
+         ...
+
+     @property
+     def sedavg(self):
+         ...
+
+     @property
+     def sedcnt(self):
+         ...
+
+     @property
+     def sedtopn(self):
+         ...
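Every body in the new module is a literal `...` stub; the metric computations land elsewhere. For orientation only, a hypothetical sketch of what a filled-in pair of segmental-SNR properties could look like, reusing the framing and [-10, 35] dB clamp from calc_metric_spenh.snr; nothing here is the shipped implementation:

import numpy as np


class MixtureMetricsSketch:
    """Illustrative stand-in, not the released genmetrics code."""

    def __init__(self, target: np.ndarray, noise: np.ndarray, frame_size: int = 160):
        self._target = target
        self._noise = noise
        self._frame_size = frame_size

    def _segmental_snr(self) -> np.ndarray:
        eps = np.finfo(np.float32).eps
        n = (len(self._target) // self._frame_size) * self._frame_size
        t = self._target[:n].reshape(-1, self._frame_size)
        v = self._noise[:n].reshape(-1, self._frame_size)
        ssnr = 10 * np.log10(np.sum(t**2, axis=1) / (np.sum(v**2, axis=1) + eps) + eps)
        return np.clip(ssnr, -10.0, 35.0)  # clamp as in calc_metric_spenh.snr

    @property
    def mxssnravg(self) -> float:
        return float(np.mean(self._segmental_snr()))

    @property
    def mxssnrstd(self) -> float:
        return float(np.std(self._segmental_snr()))
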
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/genmixdb.py

@@ -174,7 +174,6 @@ def genmixdb(location: str,
      from sonusai.mixture import initialize_db
      from sonusai.mixture import load_config
      from sonusai.mixture import log_duration_and_sizes
-     from sonusai.mixture import populate_asr_manifest_table
      from sonusai.mixture import populate_class_label_table
      from sonusai.mixture import populate_class_weights_threshold_table
      from sonusai.mixture import populate_impulse_response_file_table
@@ -195,7 +194,6 @@ def genmixdb(location: str,
      mixdb = MixtureDatabase(location=location, test=test)

      populate_top_table(location, config, test)
-     populate_asr_manifest_table(location, config, test)
      populate_class_label_table(location, config, test)
      populate_class_weights_threshold_table(location, config, test)
      populate_spectral_mask_table(location, config, test)
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/__init__.py

@@ -88,7 +88,6 @@ from .feature import get_feature_from_audio
  from .generation import generate_mixtures
  from .generation import get_all_snrs_from_config
  from .generation import initialize_db
- from .generation import populate_asr_manifest_table
  from .generation import populate_class_label_table
  from .generation import populate_class_weights_threshold_table
  from .generation import populate_impulse_response_file_table
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/constants.py

@@ -4,7 +4,6 @@ from importlib.resources import as_file
  from importlib.resources import files

  REQUIRED_CONFIGS = [
-     'asr_manifest',
      'class_balancing',
      'class_balancing_augmentation',
      'class_labels',
{sonusai-0.17.2 → sonusai-0.18.0}/sonusai/mixture/datatypes.py

@@ -1,5 +1,4 @@
  from dataclasses import dataclass
- from dataclasses import field
  from typing import Optional
  from typing import TypeAlias

@@ -135,13 +134,7 @@ class UniversalSNR(float):
          return self._is_random


- # @dataclass(frozen=True)
- # class UniversalSNR:
- #     is_random: bool
- #     value: float
- #
- #     def __lt__(self, other) -> bool:
- #         return self.value < other.value
+ Speaker: TypeAlias = dict[str, str]


  @dataclass
@@ -151,6 +144,7 @@ class TargetFile(DataClassSonusAIMixin):
      truth_settings: TruthSettings
      class_balancing_augmentation: Optional[AugmentationRule] = None
      level_type: Optional[str] = None
+     speaker_id: Optional[int] = None

      @property
      def duration(self) -> float:
@@ -317,7 +311,6 @@ class FeatureGeneratorInfo:

  @dataclass
  class MixtureDatabaseConfig(DataClassSonusAIMixin):
-     asr_manifest: list[str] = field(default_factory=list)
      class_balancing: Optional[bool] = False
      class_labels: Optional[list[str]] = None
      class_weights_threshold: Optional[list[float]] = None