sonusai 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. sonusai/audiofe.py +22 -51
  2. sonusai/calc_metric_spenh.py +206 -213
  3. sonusai/doc/doc.py +1 -1
  4. sonusai/mixture/__init__.py +2 -0
  5. sonusai/mixture/audio.py +12 -0
  6. sonusai/mixture/datatypes.py +11 -3
  7. sonusai/mixture/mixdb.py +101 -0
  8. sonusai/mixture/soundfile_audio.py +39 -0
  9. sonusai/mixture/speaker_metadata.py +35 -0
  10. sonusai/mixture/torchaudio_audio.py +22 -0
  11. sonusai/mkmanifest.py +1 -1
  12. sonusai/onnx_predict.py +114 -410
  13. sonusai/queries/queries.py +1 -1
  14. sonusai/speech/__init__.py +3 -0
  15. sonusai/speech/l2arctic.py +116 -0
  16. sonusai/speech/librispeech.py +99 -0
  17. sonusai/speech/mcgill.py +70 -0
  18. sonusai/speech/textgrid.py +100 -0
  19. sonusai/speech/timit.py +135 -0
  20. sonusai/speech/types.py +12 -0
  21. sonusai/speech/vctk.py +52 -0
  22. sonusai/speech/voxceleb2.py +86 -0
  23. sonusai/utils/__init__.py +2 -1
  24. sonusai/utils/asr_manifest_functions/__init__.py +0 -1
  25. sonusai/utils/asr_manifest_functions/data.py +0 -8
  26. sonusai/utils/asr_manifest_functions/librispeech.py +1 -1
  27. sonusai/utils/asr_manifest_functions/mcgill_speech.py +1 -1
  28. sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +1 -1
  29. sonusai/utils/braced_glob.py +7 -3
  30. sonusai/utils/onnx_utils.py +110 -106
  31. sonusai/utils/path_info.py +7 -0
  32. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/METADATA +2 -1
  33. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/RECORD +35 -30
  34. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/WHEEL +1 -1
  35. sonusai/calc_metric_spenh-save.py +0 -1334
  36. sonusai/onnx_predict-old.py +0 -240
  37. sonusai/onnx_predict-save.py +0 -487
  38. sonusai/ovino_predict.py +0 -508
  39. sonusai/ovino_query_devices.py +0 -47
  40. sonusai/torchl_onnx-old.py +0 -216
  41. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/entry_points.txt +0 -0
sonusai/doc/doc.py CHANGED
@@ -40,7 +40,7 @@ Required field:
40
40
  'name'
41
41
  File name. May be one of the following:
42
42
 
43
- audio Supported formats are .wav, .mp3, .aif, .flac, and .ogg
43
+ audio Supported formats are .wav, .mp3, .m4a, .aif, .flac, and .ogg
44
44
  glob Matches file glob patterns
45
45
  .yml The given YAML file is parsed into the list
46
46
  .txt Each line in the given text file indicates an item which
@@ -2,6 +2,7 @@
2
2
  from .audio import get_duration
3
3
  from .audio import get_next_noise
4
4
  from .audio import get_num_samples
5
+ from .audio import get_sample_rate
5
6
  from .audio import read_audio
6
7
  from .audio import read_ir
7
8
  from .audio import validate_input_file
@@ -73,6 +74,7 @@ from .datatypes import Predict
73
74
  from .datatypes import Segsnr
74
75
  from .datatypes import SpectralMask
75
76
  from .datatypes import SpectralMasks
77
+ from .datatypes import SpeechMetadata
76
78
  from .datatypes import TargetFile
77
79
  from .datatypes import TargetFiles
78
80
  from .datatypes import TransformConfig
sonusai/mixture/audio.py CHANGED
@@ -45,6 +45,18 @@ def validate_input_file(input_filepath: str) -> None:
45
45
  raise SonusAIError(f'This installation cannot process .{ext} files')
46
46
 
47
47
 
48
@lru_cache
def get_sample_rate(name: str) -> int:
    """Return the sample rate of an audio file.

    Results are memoized per file name by ``lru_cache``.

    :param name: File name
    :return: Sample rate
    """
    # Imported lazily and under an alias so the backend function does not shadow this wrapper's own name.
    from .soundfile_audio import get_sample_rate as _backend_get_sample_rate

    sample_rate = _backend_get_sample_rate(name)
    return sample_rate
58
+
59
+
48
60
  @lru_cache
49
61
  def read_audio(name: str) -> AudioT:
50
62
  """Read audio data from a file
@@ -6,6 +6,7 @@ from typing import TypeAlias
6
6
  import numpy as np
7
7
  import numpy.typing as npt
8
8
  from dataclasses_json import DataClassJsonMixin
9
+ from praatio.utilities.constants import Interval
9
10
 
10
11
  AudioT: TypeAlias = npt.NDArray[np.float32]
11
12
  AudiosT: TypeAlias = list[AudioT]
@@ -249,7 +250,7 @@ class Target(DataClassSonusAIMixin):
249
250
  gain: Optional[float] = None
250
251
 
251
252
 
252
- Targets = list[Target]
253
+ Targets: TypeAlias = list[Target]
253
254
 
254
255
 
255
256
  @dataclass
@@ -276,11 +277,15 @@ class Mixture(DataClassSonusAIMixin):
276
277
  return self.noise.file_id
277
278
 
278
279
@property
def target_ids(self) -> list[int]:
    """File IDs of this mixture's targets, in the same order as ``self.targets``."""
    ids: list[int] = []
    for tgt in self.targets:
        ids.append(tgt.file_id)
    return ids
281
282
 
283
@property
def target_augmentations(self) -> list[Augmentation]:
    """Augmentation of each of this mixture's targets, in the same order as ``self.targets``."""
    augmentations = []
    for tgt in self.targets:
        augmentations.append(tgt.augmentation)
    return augmentations
286
+
282
287
 
283
- Mixtures = list[Mixture]
288
+ Mixtures: TypeAlias = list[Mixture]
284
289
 
285
290
 
286
291
  @dataclass(frozen=True)
@@ -326,3 +331,6 @@ class MixtureDatabaseConfig(DataClassSonusAIMixin):
326
331
  target_files: Optional[TargetFiles] = None
327
332
  truth_mutex: Optional[bool] = None
328
333
  truth_reduction_function: Optional[str] = None
334
+
335
+
336
+ SpeechMetadata: TypeAlias = str | list[Interval] | None
sonusai/mixture/mixdb.py CHANGED
@@ -1,11 +1,16 @@
1
1
  from functools import cached_property
2
2
  from functools import lru_cache
3
3
  from functools import partial
4
+ from pathlib import Path
4
5
  from sqlite3 import Connection
5
6
  from sqlite3 import Cursor
6
7
  from typing import Any
8
+ from typing import Callable
7
9
  from typing import Optional
8
10
 
11
+ from praatio import textgrid
12
+ from praatio.utilities.constants import Interval
13
+
9
14
  from sonusai.mixture.datatypes import AudioF
10
15
  from sonusai.mixture.datatypes import AudioT
11
16
  from sonusai.mixture.datatypes import AudiosF
@@ -23,11 +28,13 @@ from sonusai.mixture.datatypes import NoiseFiles
23
28
  from sonusai.mixture.datatypes import Segsnr
24
29
  from sonusai.mixture.datatypes import SpectralMask
25
30
  from sonusai.mixture.datatypes import SpectralMasks
31
+ from sonusai.mixture.datatypes import SpeechMetadata
26
32
  from sonusai.mixture.datatypes import TargetFile
27
33
  from sonusai.mixture.datatypes import TargetFiles
28
34
  from sonusai.mixture.datatypes import TransformConfig
29
35
  from sonusai.mixture.datatypes import Truth
30
36
  from sonusai.mixture.datatypes import UniversalSNR
37
+ from sonusai.mixture.tokenized_shell_vars import tokenized_expand
31
38
 
32
39
 
33
40
  def db_file(location: str, test: bool = False) -> str:
@@ -81,6 +88,7 @@ class MixtureDatabase:
81
88
  def __init__(self, location: str, test: bool = False) -> None:
82
89
  self.location = location
83
90
  self.db = partial(SQLiteContextManager, self.location, test)
91
+ self._speaker_metadata_tiers: list[str] = []
84
92
 
85
93
  @cached_property
86
94
  def json(self) -> str:
@@ -1069,6 +1077,99 @@ class MixtureDatabase:
1069
1077
 
1070
1078
  return class_count
1071
1079
 
1080
@cached_property
def _speech_metadata(self) -> dict[str, dict[str, SpeechMetadata]]:
    """Load speech metadata from the TextGrid file that sits beside each target file.

    Speech metadata is a nested dictionary.

    data['target_file_name'] = { 'tier': SpeechMetadata, ... }

    For every target file, the name is expanded with ``tokenized_expand`` and its suffix is replaced with
    ``.TextGrid``. If that file exists, each of its tiers is stored as:

    - the entry's label (a string) when the tier has exactly one entry
    - the list of entries when the tier has more than one entry
    - nothing (the tier is omitted) when the tier has no entries

    Target files that have no TextGrid file map to an empty dictionary.
    """
    data: dict[str, dict[str, SpeechMetadata]] = {}
    for file in self.target_files:
        tiers: dict[str, SpeechMetadata] = {}
        data[file.name] = tiers

        file_name, _ = tokenized_expand(file.name)
        tg_file = Path(file_name).with_suffix('.TextGrid')
        if not tg_file.exists():
            continue

        tg = textgrid.openTextgrid(str(tg_file), includeEmptyIntervals=False)
        for tier in tg.tierNames:
            entries = tg.getTier(tier).entries
            if not entries:
                # Empty intervals are excluded on load, so a tier can end up with no entries at all;
                # indexing entries[0] here would raise IndexError.
                continue

            if len(entries) == 1:
                tiers[tier] = entries[0].label
            else:
                # Store a real list so the value matches the SpeechMetadata alias and list-based type checks.
                tiers[tier] = list(entries)

    return data
1101
+
1102
@cached_property
def speech_metadata_tiers(self) -> list[str]:
    """Sorted, de-duplicated names of every tier present in the speech metadata of any target file."""
    tier_names: set[str] = set()
    for tiers in self._speech_metadata.values():
        tier_names.update(tiers.keys())
    return sorted(tier_names)
1105
+
1106
def speech_metadata_all(self, tier: str) -> list[SpeechMetadata]:
    """Return the sorted, unique string values stored for the given tier across all target files.

    Target files that lack the tier, or whose value for it is not a string, are ignored.

    :param tier: Tier name
    :return: Sorted list of unique string values
    """
    labels = set()
    for tiers in self._speech_metadata.values():
        item = tiers.get(tier)
        if isinstance(item, str):
            labels.add(item)
    return sorted(labels)
1110
+
1111
def mixids_for_speech_metadata(self,
                               tier: str,
                               value: str,
                               predicate: Optional[Callable[[str], bool]] = None) -> list[int]:
    """Get a list of mixids for the given speech metadata tier.

    If 'predicate' is None, then include mixids whose tier values are equal to the given 'value'. If 'predicate' is
    not None, then ignore 'value' and use the given callable to determine which entries to include.

    Only target files whose value for the tier is a string are considered.

    Examples:

    >>> mixids = mixdb.mixids_for_speech_metadata('speaker_id', 'TIMIT_ARC0')
    Get mixids for mixtures with speakers whose speaker_ids are 'TIMIT_ARC0'.

    >>> mixids = mixdb.mixids_for_speech_metadata('age', '', lambda x: int(x) < 25)
    Get mixids for mixtures with speakers whose ages are less than 25.

    >>> mixids = mixdb.mixids_for_speech_metadata('dialect', '', lambda x: x in ['New York City', 'Northern'])
    Get mixids for mixtures with speakers whose dialects are either 'New York City' or 'Northern'.

    :param tier: Tier name
    :param value: Value to match when no predicate is given
    :param predicate: Optional callable applied to each string tier value
    :return: Sorted list of unique mixids having at least one matching target file
    """
    if predicate is None:
        def predicate(x: str) -> bool:
            return x == value

    # First get the set of matching target files; a set keeps the membership test below O(1)
    target_files = set()
    for name, tiers in self._speech_metadata.items():
        item = tiers.get(tier)
        if isinstance(item, str) and predicate(item):
            target_files.add(name)

    if not target_files:
        # Nothing can match, so there is no need to walk the mixtures
        return []

    # Next get the mixids that contain at least one of those target files
    mixids = {
        mixid for mixid in self.mixids_to_list()
        if any(self.target_file(target.file_id).name in target_files for target in self.mixture(mixid).targets)
    }

    # Return sorted, unique list of mixids
    return sorted(mixids)
1149
+
1150
def get_speech_metadata(self, mixid: int, tier: str) -> list[SpeechMetadata]:
    """Get the speech metadata of the given tier for each target of a mixture.

    The result has exactly one item per target, in target order:

    - None when the target's file has no data for the tier
    - the stored string when the tier value is a string
    - a list of entries when the tier value is a sequence of entries; if the target has a tempo augmentation,
      each entry's start and end are divided by the tempo

    :param mixid: Mixture ID
    :param tier: Tier name
    :return: List of speech metadata, one item per target
    """
    results: list[SpeechMetadata] = []
    for target in self.mixture(mixid).targets:
        data = self._speech_metadata[self.target_file(target.file_id).name].get(tier)

        if isinstance(data, (list, tuple)):
            # Check for tempo augmentation and adjust Interval start and end data as needed
            tempo = target.augmentation.tempo
            if tempo is None:
                entries = list(data)
            else:
                entries = [Interval(entry.start / tempo, entry.end / tempo, entry.label) for entry in data]

            # The adjusted entries must be added to the results; otherwise interval tiers are dropped and the
            # returned list no longer lines up with the mixture's targets.
            results.append(entries)
        else:
            # Either a plain string value or None (no data for this tier)
            results.append(data)

    return results
1172
+
1072
1173
 
1073
1174
  @lru_cache
1074
1175
  def _spectral_mask(db: partial, sm_id: int) -> SpectralMask:
@@ -18,6 +18,11 @@ def _raw_read(name: str) -> tuple[AudioT, int]:
18
18
  raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
19
19
  raw = raw / 2 ** (sound.sample_width * 8 - 1)
20
20
  sample_rate = sound.frame_rate
21
+ elif expanded_name.endswith('.m4a'):
22
+ sound = AudioSegment.from_file(expanded_name)
23
+ raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
24
+ raw = raw / 2 ** (sound.sample_width * 8 - 1)
25
+ sample_rate = sound.frame_rate
21
26
  else:
22
27
  raw, sample_rate = soundfile.read(expanded_name, always_2d=True, dtype='float32')
23
28
  except Exception as e:
@@ -29,6 +34,35 @@ def _raw_read(name: str) -> tuple[AudioT, int]:
29
34
  return np.squeeze(raw[:, 0]), sample_rate
30
35
 
31
36
 
37
def get_sample_rate(name: str) -> int:
    """Return the sample rate of an audio file.

    '.mp3' and '.m4a' files are opened with pydub; every other file is inspected with soundfile.

    :param name: File name
    :return: Sample rate
    :raises SonusAIError: if the file cannot be read
    """
    import soundfile
    from pydub import AudioSegment

    from sonusai import SonusAIError
    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        if expanded_name.endswith('.mp3'):
            sample_rate = AudioSegment.from_mp3(expanded_name).frame_rate
        elif expanded_name.endswith('.m4a'):
            sample_rate = AudioSegment.from_file(expanded_name).frame_rate
        else:
            sample_rate = soundfile.info(expanded_name).samplerate
        return sample_rate
    except Exception as e:
        # Mention the expanded name only when expansion actually changed it
        if name == expanded_name:
            raise SonusAIError(f'Error reading {name}: {e}')
        raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
64
+
65
+
32
66
  def read_ir(name: str) -> ImpulseResponseData:
33
67
  """Read impulse response data using soundfile
34
68
 
@@ -87,8 +121,13 @@ def get_num_samples(name: str) -> int:
87
121
  sound = AudioSegment.from_mp3(expanded_name)
88
122
  samples = sound.frame_count()
89
123
  sample_rate = sound.frame_rate
124
+ elif expanded_name.endswith('.m4a'):
125
+ sound = AudioSegment.from_file(expanded_name)
126
+ samples = sound.frame_count()
127
+ sample_rate = sound.frame_rate
90
128
  else:
91
129
  info = soundfile.info(name)
92
130
  samples = info.frames
93
131
  sample_rate = info.samplerate
132
+
94
133
  return math.ceil(SAMPLE_RATE * samples / sample_rate)
@@ -0,0 +1,35 @@
1
+ from functools import cached_property
2
+ from pathlib import Path
3
+
4
+ from praatio import textgrid
5
+ from praatio.data_classes.textgrid_tier import TextgridTier
6
+ from praatio.utilities.constants import Interval
7
+
8
+ from sonusai.mixture.datatypes import TargetFiles
9
+ from sonusai.mixture.tokenized_shell_vars import tokenized_expand
10
+
11
+
12
class SpeakerMetadata:
    """TextGrid tiers for a collection of target files.

    ``data`` maps each target file name to a dictionary of ``{tier name: TextgridTier}`` loaded from the
    ``.TextGrid`` file next to the (expanded) target file. Target files without such a file map to an empty
    dictionary.
    """

    def __init__(self, target_files: TargetFiles) -> None:
        """Load the tiers of every target file that has an accompanying TextGrid file.

        :param target_files: Target files to inspect
        """
        self.data: dict[str, dict[str, TextgridTier]] = {}
        for file in target_files:
            self.data[file.name] = {}
            file_name, _ = tokenized_expand(file.name)
            tg_file = Path(file_name).with_suffix('.TextGrid')
            if tg_file.exists():
                tg = textgrid.openTextgrid(str(tg_file), includeEmptyIntervals=False)
                for tier in tg.tierNames:
                    self.data[file.name][tier] = tg.getTier(tier)

    @cached_property
    def tiers(self) -> list[str]:
        """Sorted, de-duplicated names of every tier found in any target file."""
        return sorted({key for value in self.data.values() for key in value})

    def all(self, tier: str, label_only: bool = False) -> list:
        """Collect the entries of the given tier across all target files.

        Target files that do not have the tier (including those with no TextGrid file) are skipped.

        :param tier: Tier name
        :param label_only: If True, return the sorted, unique labels instead of the entries
        :return: One entries collection per target file having the tier, or a sorted list of unique labels
        """
        # Guard with 'tier in value': files without this tier would otherwise raise KeyError
        results = [value[tier].entries for value in self.data.values() if tier in value]
        if label_only:
            return sorted({entry.label for entries in results for entry in entries})
        return results

    def mixids_for(self, tier: str, value: str) -> list[int]:
        """Placeholder; not implemented, currently returns None."""
        # NOTE(review): presumably meant to return the mixids matching 'value' in 'tier' - confirm intent
        pass
@@ -39,6 +39,28 @@ def read_impulse_response(name: str) -> ImpulseResponseData:
39
39
  return ImpulseResponseData(name=name, sample_rate=sample_rate, data=data)
40
40
 
41
41
 
42
def get_sample_rate(name: str) -> int:
    """Return the sample rate of an audio file as reported by torchaudio.

    :param name: File name
    :return: Sample rate
    :raises SonusAIError: if the file cannot be read
    """
    import torchaudio

    from sonusai import SonusAIError
    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        info = torchaudio.info(expanded_name)
        return info.sample_rate
    except Exception as e:
        # Mention the expanded name only when expansion actually changed it
        if name == expanded_name:
            raise SonusAIError(f'Error reading {name}:\n{e}')
        raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}):\n{e}')
62
+
63
+
42
64
  def read_audio(name: str) -> AudioT:
43
65
  """Read audio data from a file using torchaudio
44
66
 
sonusai/mkmanifest.py CHANGED
@@ -94,10 +94,10 @@ def main() -> None:
94
94
  from sonusai import initial_log_messages
95
95
  from sonusai import logger
96
96
  from sonusai import update_console_handler
97
+ from sonusai.utils import PathInfo
97
98
  from sonusai.utils import braced_iglob
98
99
  from sonusai.utils import pp_tqdm_imap
99
100
  from sonusai.utils import seconds_to_hms
100
- from sonusai.utils.asr_manifest_functions import PathInfo
101
101
  from sonusai.utils.asr_manifest_functions import collect_librispeech_transcripts
102
102
  from sonusai.utils.asr_manifest_functions import collect_vctk_noisy_speech_transcripts
103
103
  from sonusai.utils.asr_manifest_functions import get_librispeech_manifest_entry