sonusai 0.19.6__py3-none-any.whl → 0.19.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +1 -1
- sonusai/aawscd_probwrite.py +1 -1
- sonusai/calc_metric_spenh.py +1 -1
- sonusai/genft.py +29 -14
- sonusai/genmetrics.py +60 -42
- sonusai/genmix.py +41 -29
- sonusai/genmixdb.py +54 -62
- sonusai/metrics/calc_class_weights.py +1 -3
- sonusai/metrics/calc_optimal_thresholds.py +2 -2
- sonusai/metrics/calc_phase_distance.py +1 -1
- sonusai/metrics/calc_speech.py +6 -6
- sonusai/metrics/class_summary.py +6 -15
- sonusai/metrics/confusion_matrix_summary.py +11 -27
- sonusai/metrics/one_hot.py +3 -3
- sonusai/metrics/snr_summary.py +7 -7
- sonusai/mixture/__init__.py +2 -17
- sonusai/mixture/augmentation.py +5 -6
- sonusai/mixture/class_count.py +1 -1
- sonusai/mixture/config.py +36 -46
- sonusai/mixture/data_io.py +30 -1
- sonusai/mixture/datatypes.py +29 -40
- sonusai/mixture/db_datatypes.py +1 -1
- sonusai/mixture/feature.py +3 -23
- sonusai/mixture/generation.py +202 -235
- sonusai/mixture/helpers.py +29 -187
- sonusai/mixture/mixdb.py +386 -159
- sonusai/mixture/soundfile_audio.py +1 -1
- sonusai/mixture/sox_audio.py +4 -4
- sonusai/mixture/sox_augmentation.py +1 -1
- sonusai/mixture/target_class_balancing.py +9 -11
- sonusai/mixture/targets.py +23 -20
- sonusai/mixture/truth.py +21 -34
- sonusai/mixture/truth_functions/__init__.py +6 -0
- sonusai/mixture/truth_functions/crm.py +51 -37
- sonusai/mixture/truth_functions/energy.py +95 -50
- sonusai/mixture/truth_functions/file.py +12 -8
- sonusai/mixture/truth_functions/metadata.py +24 -0
- sonusai/mixture/truth_functions/metrics.py +28 -0
- sonusai/mixture/truth_functions/phoneme.py +4 -5
- sonusai/mixture/truth_functions/sed.py +32 -23
- sonusai/mixture/truth_functions/target.py +62 -29
- sonusai/mkwav.py +20 -19
- sonusai/queries/queries.py +9 -15
- sonusai/speech/l2arctic.py +6 -2
- sonusai/summarize_metric_spenh.py +1 -1
- sonusai/utils/__init__.py +1 -0
- sonusai/utils/asr_functions/aaware_whisper.py +1 -1
- sonusai/utils/audio_devices.py +27 -18
- sonusai/utils/docstring.py +6 -3
- sonusai/utils/energy_f.py +5 -3
- sonusai/utils/human_readable_size.py +6 -6
- sonusai/utils/load_object.py +15 -0
- sonusai/utils/onnx_utils.py +2 -2
- sonusai/utils/print_mixture_details.py +3 -3
- {sonusai-0.19.6.dist-info → sonusai-0.19.8.dist-info}/METADATA +2 -2
- {sonusai-0.19.6.dist-info → sonusai-0.19.8.dist-info}/RECORD +58 -56
- sonusai/mixture/truth_functions/datatypes.py +0 -37
- {sonusai-0.19.6.dist-info → sonusai-0.19.8.dist-info}/WHEEL +0 -0
- {sonusai-0.19.6.dist-info → sonusai-0.19.8.dist-info}/entry_points.txt +0 -0
sonusai/mixture/datatypes.py
CHANGED
@@ -12,16 +12,12 @@ from dataclasses_json import DataClassJsonMixin
 from praatio.utilities.constants import Interval
 
 AudioT: TypeAlias = npt.NDArray[np.float32]
-AudiosT: TypeAlias = list[AudioT]
 
-
-
-Truth: TypeAlias = npt.NDArray[np.float32]
+Truth: TypeAlias = Any
 TruthDict: TypeAlias = dict[str, Truth]
 Segsnr: TypeAlias = npt.NDArray[np.float32]
 
 AudioF: TypeAlias = npt.NDArray[np.complex64]
-AudiosF: TypeAlias = list[AudioF]
 
 EnergyT: TypeAlias = npt.NDArray[np.float32]
 EnergyF: TypeAlias = npt.NDArray[np.float32]
@@ -92,9 +88,6 @@ class AugmentationRule(DataClassSonusAIMixin):
     mixup: int = 1
 
 
-AugmentationRules: TypeAlias = list[AugmentationRule]
-
-
 @dataclass
 class Augmentation(DataClassSonusAIMixin):
     normalize: float | None = None
@@ -108,9 +101,6 @@ class Augmentation(DataClassSonusAIMixin):
     ir: int | None = None
 
 
-Augmentations: TypeAlias = list[Augmentation]
-
-
 @dataclass(frozen=True)
 class UniversalSNRGenerator:
     is_random: bool
@@ -159,18 +149,12 @@ class TargetFile(DataClassSonusAIMixin):
         return self.samples / SAMPLE_RATE
 
 
-TargetFiles: TypeAlias = list[TargetFile]
-
-
 @dataclass
 class AugmentedTarget(DataClassSonusAIMixin):
     target_id: int
     target_augmentation_id: int
 
 
-AugmentedTargets: TypeAlias = list[AugmentedTarget]
-
-
 @dataclass
 class NoiseFile(DataClassSonusAIMixin):
     name: str
@@ -183,7 +167,6 @@ class NoiseFile(DataClassSonusAIMixin):
         return self.samples / SAMPLE_RATE
 
 
-NoiseFiles: TypeAlias = list[NoiseFile]
 ClassCount: TypeAlias = list[int]
 
 GeneralizedIDs: TypeAlias = str | int | list[int] | range
@@ -191,11 +174,11 @@ GeneralizedIDs: TypeAlias = str | int | list[int] | range
 
 @dataclass
 class GenMixData:
-    targets:
+    targets: list[AudioT] | None = None
     target: AudioT | None = None
     noise: AudioT | None = None
     mixture: AudioT | None = None
-    truth_t: TruthDict | None = None
+    truth_t: list[TruthDict] | None = None
     segsnr_t: Segsnr | None = None
 
 
@@ -223,9 +206,6 @@ class ImpulseResponseFile:
     tags: list[str]
 
 
-ImpulseResponseFiles: TypeAlias = list[ImpulseResponseFile]
-
-
 @dataclass(frozen=True)
 class SpectralMask(DataClassSonusAIMixin):
     f_max_width: int
@@ -235,23 +215,24 @@ class SpectralMask(DataClassSonusAIMixin):
     t_max_percent: int
 
 
-SpectralMasks: TypeAlias = list[SpectralMask]
-
-
 @dataclass(frozen=True)
 class TruthParameter(DataClassSonusAIMixin):
     name: str
-    parameters: int
-
-
-TruthParameters: TypeAlias = list[TruthParameter]
+    parameters: int | None
 
 
 @dataclass
 class Target(DataClassSonusAIMixin):
     file_id: int
     augmentation: Augmentation
-
+
+    @property
+    def gain(self) -> float:
+        # gain is used to back out the gain augmentation in order to return the target audio
+        # to its normalized level when calculating truth (if needed).
+        if self.augmentation.gain is None:
+            return 1.0
+        return round(10 ** (self.augmentation.gain / 20), ndigits=5)
 
 
 Targets: TypeAlias = list[Target]
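Note: the new Target.gain property shown above converts the augmentation's gain from dB back to a linear factor so the target audio can be restored to its normalized level when truth is calculated. A minimal standalone sketch of that arithmetic, with illustrative values only:

def linear_gain(gain_db: float | None) -> float:
    # Undo a dB gain augmentation; None means no gain was applied.
    if gain_db is None:
        return 1.0
    return round(10 ** (gain_db / 20), ndigits=5)

assert linear_gain(None) == 1.0
assert linear_gain(0) == 1.0
assert linear_gain(-6) == 0.50119  # -6 dB is roughly half amplitude
assert linear_gain(20) == 10.0     # +20 dB is 10x amplitude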
@@ -261,14 +242,14 @@ Targets: TypeAlias = list[Target]
 class Noise(DataClassSonusAIMixin):
     file_id: int
     augmentation: Augmentation
-    offset: int = 0
 
 
 @dataclass
 class Mixture(DataClassSonusAIMixin):
     name: str
-    targets:
+    targets: list[Target]
     noise: Noise
+    noise_offset: int
     samples: int
     snr: UniversalSNR
     spectral_mask_id: int
@@ -288,8 +269,16 @@ class Mixture(DataClassSonusAIMixin):
     def target_augmentations(self) -> list[Augmentation]:
         return [target.augmentation for target in self.targets]
 
+    @property
+    def is_noise_only(self) -> bool:
+        return self.snr < -96
+
+    @property
+    def is_target_only(self) -> bool:
+        return self.snr > 96
 
-
+    def target_gain(self, target_index: int) -> float:
+        return (self.targets[target_index].gain if not self.is_noise_only else 0) * self.target_snr_gain
 
 
 @dataclass(frozen=True)
@@ -304,7 +293,7 @@ class TransformConfig:
 @dataclass(frozen=True)
 class FeatureGeneratorConfig:
     feature_mode: str
-    truth_parameters: dict[str, int]
+    truth_parameters: dict[str, int | None]
 
 
 @dataclass(frozen=True)
@@ -328,13 +317,13 @@ class MixtureDatabaseConfig(DataClassSonusAIMixin):
     class_labels: list[str]
     class_weights_threshold: list[float]
     feature: str
-    impulse_response_files:
-    mixtures:
+    impulse_response_files: list[ImpulseResponseFile]
+    mixtures: list[Mixture]
     noise_mix_mode: str
-    noise_files:
+    noise_files: list[NoiseFile]
     num_classes: int
-    spectral_masks:
-    target_files:
+    spectral_masks: list[SpectralMask]
+    target_files: list[TargetFile]
 
 
 SpeechMetadata: TypeAlias = str | list[Interval] | None
sonusai/mixture/db_datatypes.py
CHANGED
@@ -35,7 +35,7 @@ SpectralMaskRecord = namedtuple(
     ["id", "f_max_width", "f_num", "t_max_width", "t_num", "t_max_percent"],
 )
 
-TargetRecord = namedtuple("TargetRecord", ["id", "file_id", "augmentation"
+TargetRecord = namedtuple("TargetRecord", ["id", "file_id", "augmentation"])
 
 MixtureRecord = namedtuple(
     "MixtureRecord",
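Note: TargetRecord is now a three-field namedtuple (the trailing fields of the old definition are truncated in this rendering). A minimal illustration with placeholder values; the augmentation string below is a made-up stand-in for whatever serialized form the database actually stores:

from collections import namedtuple

TargetRecord = namedtuple("TargetRecord", ["id", "file_id", "augmentation"])

record = TargetRecord(id=1, file_id=42, augmentation='{"gain": -6}')  # placeholder values
assert record.file_id == 42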
sonusai/mixture/feature.py
CHANGED
@@ -12,7 +12,6 @@ def get_feature_from_audio(
     :param feature_mode: Feature mode
     :return: Feature data [frames, strides, feature_parameters]
     """
-    import numpy as np
     from pyaaware import FeatureGenerator
 
     from .datatypes import TransformConfig
@@ -31,33 +30,14 @@ def get_feature_from_audio(
         ),
     )
 
-
-    feature_frames = transform_frames // (fg.decimation * fg.step)
-    feature = np.empty((feature_frames, fg.stride, fg.feature_parameters), dtype=np.float32)
-
-    feature_frame = 0
-    for transform_frame in range(transform_frames):
-        fg.execute(audio_f[transform_frame])
-
-        if fg.eof():
-            feature[feature_frame] = fg.feature()
-            feature_frame += 1
+    return fg.execute_all(audio_f)[0]
 
-    return feature
 
-
-def get_audio_from_feature(
-    feature: Feature,
-    feature_mode: str,
-    num_classes: int | None = 1,
-    truth_mutex: bool | None = False,
-) -> AudioT:
+def get_audio_from_feature(feature: Feature, feature_mode: str) -> AudioT:
     """Apply inverse transform to feature data to generate audio data
 
     :param feature: Feature data [frames, stride=1, feature_parameters]
     :param feature_mode: Feature mode
-    :param num_classes: Number of classes
-    :param truth_mutex: Whether to calculate 'other' label
     :return: Audio data [samples]
     """
     import numpy as np
@@ -75,7 +55,7 @@ def get_audio_from_feature(
     if feature.shape[1] != 1:
         raise ValueError("Strided feature data is not supported for audio extraction; stride must be 1.")
 
-    fg = FeatureGenerator(feature_mode=feature_mode
+    fg = FeatureGenerator(feature_mode=feature_mode)
 
     feature_complex = unstack_complex(feature.squeeze())
     if feature_mode[0:1] == "h":