sonusai 0.19.6__py3-none-any.whl → 0.19.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +1 -1
- sonusai/aawscd_probwrite.py +1 -1
- sonusai/calc_metric_spenh.py +1 -1
- sonusai/genft.py +29 -14
- sonusai/genmetrics.py +60 -42
- sonusai/genmix.py +41 -29
- sonusai/genmixdb.py +56 -64
- sonusai/metrics/calc_class_weights.py +1 -3
- sonusai/metrics/calc_optimal_thresholds.py +2 -2
- sonusai/metrics/calc_phase_distance.py +1 -1
- sonusai/metrics/calc_speech.py +6 -6
- sonusai/metrics/class_summary.py +6 -15
- sonusai/metrics/confusion_matrix_summary.py +11 -27
- sonusai/metrics/one_hot.py +3 -3
- sonusai/metrics/snr_summary.py +7 -7
- sonusai/mixture/__init__.py +2 -17
- sonusai/mixture/augmentation.py +5 -6
- sonusai/mixture/class_count.py +1 -1
- sonusai/mixture/config.py +36 -46
- sonusai/mixture/data_io.py +30 -1
- sonusai/mixture/datatypes.py +29 -40
- sonusai/mixture/db_datatypes.py +1 -1
- sonusai/mixture/feature.py +3 -23
- sonusai/mixture/generation.py +161 -204
- sonusai/mixture/helpers.py +29 -187
- sonusai/mixture/mixdb.py +386 -159
- sonusai/mixture/soundfile_audio.py +1 -1
- sonusai/mixture/sox_audio.py +4 -4
- sonusai/mixture/sox_augmentation.py +1 -1
- sonusai/mixture/target_class_balancing.py +9 -11
- sonusai/mixture/targets.py +23 -20
- sonusai/mixture/torchaudio_audio.py +18 -7
- sonusai/mixture/torchaudio_augmentation.py +3 -4
- sonusai/mixture/truth.py +21 -34
- sonusai/mixture/truth_functions/__init__.py +6 -0
- sonusai/mixture/truth_functions/crm.py +51 -37
- sonusai/mixture/truth_functions/energy.py +95 -50
- sonusai/mixture/truth_functions/file.py +12 -8
- sonusai/mixture/truth_functions/metadata.py +24 -0
- sonusai/mixture/truth_functions/metrics.py +28 -0
- sonusai/mixture/truth_functions/phoneme.py +4 -5
- sonusai/mixture/truth_functions/sed.py +32 -23
- sonusai/mixture/truth_functions/target.py +62 -29
- sonusai/mkwav.py +20 -19
- sonusai/queries/queries.py +9 -15
- sonusai/speech/l2arctic.py +6 -2
- sonusai/summarize_metric_spenh.py +1 -1
- sonusai/utils/__init__.py +1 -0
- sonusai/utils/asr_functions/aaware_whisper.py +1 -1
- sonusai/utils/audio_devices.py +27 -18
- sonusai/utils/docstring.py +6 -3
- sonusai/utils/energy_f.py +5 -3
- sonusai/utils/human_readable_size.py +6 -6
- sonusai/utils/load_object.py +15 -0
- sonusai/utils/onnx_utils.py +2 -2
- sonusai/utils/print_mixture_details.py +3 -3
- {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/METADATA +2 -2
- {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/RECORD +60 -58
- sonusai/mixture/truth_functions/datatypes.py +0 -37
- {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/WHEEL +0 -0
- {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/entry_points.txt +0 -0
sonusai/mixture/helpers.py
CHANGED
@@ -2,26 +2,19 @@ from pyaaware import ForwardTransform
 from pyaaware import InverseTransform
 
 from sonusai.mixture.datatypes import AudioF
-from sonusai.mixture.datatypes import AudiosT
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import Augmentation
-from sonusai.mixture.datatypes import
-from sonusai.mixture.datatypes import Augmentations
+from sonusai.mixture.datatypes import AugmentationRule
 from sonusai.mixture.datatypes import EnergyT
-from sonusai.mixture.datatypes import Feature
 from sonusai.mixture.datatypes import FeatureGeneratorConfig
 from sonusai.mixture.datatypes import FeatureGeneratorInfo
 from sonusai.mixture.datatypes import GeneralizedIDs
 from sonusai.mixture.datatypes import Mixture
 from sonusai.mixture.datatypes import NoiseFile
-from sonusai.mixture.datatypes import NoiseFiles
-from sonusai.mixture.datatypes import Segsnr
 from sonusai.mixture.datatypes import SpeechMetadata
 from sonusai.mixture.datatypes import Target
-from sonusai.mixture.datatypes import
-from sonusai.mixture.datatypes import Targets
+from sonusai.mixture.datatypes import TargetFile
 from sonusai.mixture.datatypes import TransformConfig
-from sonusai.mixture.datatypes import TruthDict
 from sonusai.mixture.db_datatypes import MixtureRecord
 from sonusai.mixture.db_datatypes import TargetRecord
 from sonusai.mixture.mixdb import MixtureDatabase
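The import hunk above is mostly a typing cleanup: plural alias types such as AudiosT, Augmentations, and Targets give way to builtin generics like list[AudioT] and list[Target], and AugmentationRule and TargetFile are now imported directly. A minimal sketch of the new annotation style, using a hypothetical helper that is not part of sonusai:

```python
# Hypothetical helper (not part of sonusai) showing the 0.19.9 annotation style:
# builtin generics such as list[Target] replace the old plural alias types.
from sonusai.mixture.datatypes import AudioT
from sonusai.mixture.datatypes import Target


def first_target(targets: list[Target], targets_audio: list[AudioT]) -> tuple[Target, AudioT]:
    """Return the first target record and its audio from parallel lists."""
    return targets[0], targets_audio[0]
```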
@@ -142,13 +135,14 @@ def mixture_all_speech_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> lis
     return results
 
 
-def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
+def mixture_metadata(mixdb: MixtureDatabase, m_id: int) -> str:
     """Create a string of metadata for a Mixture
 
     :param mixdb: Mixture database
-    :param
+    :param m_id: Mixture ID
     :return: String of metadata
     """
+    mixture = mixdb.mixture(m_id)
     metadata = ""
     speech_metadata = mixture_all_speech_metadata(mixdb, mixture)
     for mi, target in enumerate(mixture.targets):
@@ -157,7 +151,7 @@ def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
         metadata += f"target {mi} name: {target_file.name}\n"
         metadata += f"target {mi} augmentation: {target.augmentation.to_dict()}\n"
         metadata += f"target {mi} ir: {mixdb.impulse_response_file(target_augmentation.ir)}\n"
-        metadata += f"target {mi} target_gain: {target.gain}\n"
+        metadata += f"target {mi} target_gain: {target.gain if not mixture.is_noise_only else 0}\n"
         metadata += f"target {mi} class indices: {target_file.class_indices}\n"
         for key in target_file.truth_configs:
             metadata += f"target {mi} truth '{key}' function: {target_file.truth_configs[key].function}\n"
@@ -169,7 +163,7 @@ def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
     metadata += f"noise name: {noise.name}\n"
     metadata += f"noise augmentation: {noise_augmentation.to_dict()}\n"
     metadata += f"noise ir: {mixdb.impulse_response_file(noise_augmentation.ir)}\n"
-    metadata += f"noise offset: {mixture.noise.offset}\n"
+    metadata += f"noise offset: {mixture.noise_offset}\n"
     metadata += f"snr: {mixture.snr}\n"
     metadata += f"random_snr: {mixture.snr.is_random}\n"
     metadata += f"samples: {mixture.samples}\n"
@@ -179,17 +173,17 @@ def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
     return metadata
 
 
-def write_mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> None:
+def write_mixture_metadata(mixdb: MixtureDatabase, m_id: int) -> None:
     """Write mixture metadata to a text file
 
     :param mixdb: Mixture database
-    :param
+    :param m_id: Mixture ID
     """
     from os.path import join
 
-    name = join(mixdb.location, "mixture", mixture.name, "metadata.txt")
+    name = join(mixdb.location, "mixture", mixdb.mixture(m_id).name, "metadata.txt")
     with open(file=name, mode="w") as f:
-        f.write(mixture_metadata(mixdb, mixture))
+        f.write(mixture_metadata(mixdb, m_id))
 
 
 def from_mixture(
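With the two hunks above, mixture_metadata and write_mixture_metadata are keyed by a mixture ID rather than a Mixture object, and the record is looked up internally via mixdb.mixture(m_id). A minimal usage sketch under that reading; the database location and the mixture ID are placeholders, not values from this diff:

```python
# Sketch of the new m_id-based calls; "./my_mixdb" and ID 0 are placeholders.
from sonusai.mixture.helpers import mixture_metadata
from sonusai.mixture.helpers import write_mixture_metadata
from sonusai.mixture.mixdb import MixtureDatabase

mixdb = MixtureDatabase("./my_mixdb")  # location previously populated by genmixdb
print(mixture_metadata(mixdb, 0))      # metadata string for mixture ID 0
write_mixture_metadata(mixdb, 0)       # writes mixture/<name>/metadata.txt
```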
@@ -199,7 +193,7 @@ def from_mixture(
         mixture.name,
         mixture.noise.file_id,
         mixture.noise.augmentation.to_json(),
-        mixture.noise.offset,
+        mixture.noise_offset,
         mixture.noise_snr_gain,
         mixture.snr.is_random,
         mixture.snr,
@@ -210,7 +204,7 @@ def from_mixture(
     )
 
 
-def to_mixture(entry: MixtureRecord, targets: Targets) -> Mixture:
+def to_mixture(entry: MixtureRecord, targets: list[Target]) -> Mixture:
     import json
 
     from sonusai.utils import dataclass_from_dict
@@ -223,9 +217,9 @@ def to_mixture(entry: MixtureRecord, targets: Targets) -> Mixture:
         name=entry.name,
         noise=Noise(
             file_id=entry.noise_file_id,
-            augmentation=dataclass_from_dict(Augmentation, json.loads(entry.noise_augmentation)),
-            offset=entry.noise_offset,
+            augmentation=dataclass_from_dict(Augmentation, json.loads(entry.noise_augmentation)),  # pyright: ignore [reportArgumentType]
         ),
+        noise_offset=entry.noise_offset,
         noise_snr_gain=entry.noise_snr_gain,
         snr=UniversalSNR(is_random=entry.random_snr, value=entry.snr),
         samples=entry.samples,
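The hunk above relocates the noise start offset: it is no longer stored on the Noise record (the removed offset=entry.noise_offset) but directly on the Mixture record as noise_offset. A small sketch of the new access pattern; the database location and mixture ID are placeholders, and the commented 0.19.6-era line is inferred from the removed code:

```python
# Sketch of reading the relocated field; "./my_mixdb" and ID 0 are placeholders.
from sonusai.mixture.mixdb import MixtureDatabase

mixdb = MixtureDatabase("./my_mixdb")
mixture = mixdb.mixture(0)
offset = mixture.noise_offset    # 0.19.9: offset lives on the Mixture record
# offset = mixture.noise.offset  # 0.19.6-era access implied by the removed lines
```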
@@ -235,8 +229,8 @@ def to_mixture(entry: MixtureRecord, targets: Targets) -> Mixture:
     )
 
 
-def from_target(target: Target) -> tuple[int, str
-    return target.file_id, target.augmentation.to_json()
+def from_target(target: Target) -> tuple[int, str]:
+    return target.file_id, target.augmentation.to_json()
 
 
 def to_target(entry: TargetRecord) -> Target:
@@ -245,147 +239,14 @@ def to_target(entry: TargetRecord) -> Target:
     from sonusai.utils import dataclass_from_dict
 
     from .datatypes import Augmentation
-    from .datatypes import Target
 
     return Target(
         file_id=entry.file_id,
-        augmentation=dataclass_from_dict(Augmentation, json.loads(entry.augmentation)),
-        gain=entry.gain,
-    )
-
-
-def get_truth(
-    mixdb: MixtureDatabase,
-    mixture: Mixture,
-    targets_audio: AudiosT,
-    noise_audio: AudioT,
-    mixture_audio: AudioT,
-) -> TruthDict:
-    """Get the truth data for the given mixture record
-
-    :param mixdb: Mixture database
-    :param mixture: Mixture record
-    :param targets_audio: List of augmented target audio data (one per target in the mixup) for the given mixture ID
-    :param noise_audio: Augmented noise audio data for the given mixture ID
-    :param mixture_audio: Mixture audio data for the given mixture ID
-    :return: truth data
-    """
-    from .datatypes import TruthDict
-    from .truth import truth_function
-
-    if not all(len(target) == mixture.samples for target in targets_audio):
-        raise ValueError("Lengths of targets do not match length of mixture")
-
-    if len(noise_audio) != mixture.samples:
-        raise ValueError("Length of noise does not match length of mixture")
-
-    # TODO: Need to understand how to do this correctly for mixup and target_mixture_f truth
-    if len(targets_audio) != 1:
-        raise NotImplementedError("mixup is not implemented")
-
-    truth: TruthDict = {}
-    for idx in range(len(targets_audio)):
-        target_file = mixdb.target_file(mixture.targets[idx].file_id)
-        for key, value in target_file.truth_configs.items():
-            truth[key] = truth_function(
-                target_audio=targets_audio[idx],
-                noise_audio=noise_audio,
-                mixture_audio=mixture_audio,
-                config=value,
-                feature=mixdb.feature,
-                num_classes=mixdb.num_classes,
-                class_indices=target_file.class_indices,
-                target_gain=mixture.targets[idx].gain * mixture.target_snr_gain,
-            )
-
-    return truth
-
-
-def get_ft(
-    mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, truth_t: TruthDict
-) -> tuple[Feature, TruthDict]:
-    """Get the feature and truth_f data for the given mixture record
-
-    :param mixdb: Mixture database
-    :param mixture: Mixture record
-    :param mixture_audio: Mixture audio data for the given mixid
-    :param truth_t: truth_t for the given mixid
-    :return: Tuple of (feature, truth_f) data
-    """
-
-    from pyaaware import FeatureGenerator
-
-    from .truth import truth_stride_reduction
-
-    mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)
-
-    fg = FeatureGenerator(mixdb.fg_config.feature_mode, mixdb.fg_config.truth_parameters)
-    feature, truth_f = fg.execute_all(mixture_f, truth_t)
-    for name in truth_f:
-        truth_f[name] = truth_stride_reduction(truth_f[name], mixdb.truth_configs[name].stride_reduction)
-
-    return feature, truth_f
-
-
-def get_segsnr(mixdb: MixtureDatabase, mixture: Mixture, target_audio: AudioT, noise: AudioT) -> Segsnr:
-    """Get the segsnr data for the given mixture record
-
-    :param mixdb: Mixture database
-    :param mixture: Mixture record
-    :param target_audio: Augmented target audio data
-    :param noise: Augmented noise audio data
-    :return: segsnr data
-    """
-    segsnr_t = get_segsnr_t(mixdb=mixdb, mixture=mixture, target_audio=target_audio, noise_audio=noise)
-    return segsnr_t[0 :: mixdb.ft_config.overlap]
-
-
-def get_segsnr_t(mixdb: MixtureDatabase, mixture: Mixture, target_audio: AudioT, noise_audio: AudioT) -> Segsnr:
-    """Get the segsnr_t data for the given mixture record
-
-    :param mixdb: Mixture database
-    :param mixture: Mixture record
-    :param target_audio: Augmented target audio data
-    :param noise_audio: Augmented noise audio data
-    :return: segsnr_t data
-    """
-    import numpy as np
-    import torch
-    from pyaaware import ForwardTransform
-
-    fft = ForwardTransform(
-        length=mixdb.ft_config.length,
-        overlap=mixdb.ft_config.overlap,
-        bin_start=mixdb.ft_config.bin_start,
-        bin_end=mixdb.ft_config.bin_end,
-        ttype=mixdb.ft_config.ttype,
+        augmentation=dataclass_from_dict(Augmentation, json.loads(entry.augmentation)),  # pyright: ignore [reportArgumentType]
     )
 
-    segsnr_t = np.empty(mixture.samples, dtype=np.float32)
-
-    target_energy = fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
-    noise_energy = fft.execute_all(torch.from_numpy(noise_audio))[1].numpy()
-
-    offsets = range(0, mixture.samples, mixdb.ft_config.overlap)
-    if len(target_energy) != len(offsets):
-        raise ValueError(
-            f"Number of frames in energy, {len(target_energy)}," f" is not number of frames in mixture, {len(offsets)}"
-        )
-
-    for idx, offset in enumerate(offsets):
-        indices = slice(offset, offset + mixdb.ft_config.overlap)
-
-        if noise_energy[idx] == 0:
-            snr = np.float32(np.inf)
-        else:
-            snr = np.float32(target_energy[idx] / noise_energy[idx])
-
-        segsnr_t[indices] = snr
-
-    return segsnr_t
 
-
-def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT) -> AudioT:
+def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: list[AudioT]) -> AudioT:
     """Get the augmented target audio data for the given mixture record
 
     :param mixdb: Mixture database
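The long removal above drops get_truth, get_ft, get_segsnr, and get_segsnr_t from helpers.py; given the large growth of mixture/mixdb.py in this release, that logic presumably now lives on MixtureDatabase. For reference, the per-sample segmental SNR expansion that the removed get_segsnr_t performed can be sketched with plain NumPy, with precomputed frame energies standing in for the pyaaware ForwardTransform output:

```python
# Standalone re-statement of the removed segsnr_t logic: each frame's
# target/noise energy ratio is written across that frame's samples.
import numpy as np


def segsnr_t_sketch(target_energy: np.ndarray, noise_energy: np.ndarray, samples: int, overlap: int) -> np.ndarray:
    segsnr_t = np.empty(samples, dtype=np.float32)
    offsets = range(0, samples, overlap)
    if len(target_energy) != len(offsets):
        raise ValueError(f"Number of frames in energy, {len(target_energy)}, is not number of frames in mixture, {len(offsets)}")

    for idx, offset in enumerate(offsets):
        indices = slice(offset, offset + overlap)
        if noise_energy[idx] == 0:
            segsnr_t[indices] = np.float32(np.inf)
        else:
            segsnr_t[indices] = np.float32(target_energy[idx] / noise_energy[idx])

    return segsnr_t
```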
@@ -413,28 +274,6 @@ def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT)
     return np.sum(targets_ir, axis=0)
 
 
-def get_mixture_f(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT) -> AudioF:
-    """Get the mixture transform for the given mixture
-
-    :param mixdb: Mixture database
-    :param mixture: Mixture record
-    :param mixture_audio: Mixture audio data for the given mixid
-    :return: Mixture transform data
-    """
-    from .spectral_mask import apply_spectral_mask
-
-    mixture_f = forward_transform(mixture_audio, mixdb.ft_config)
-
-    if mixture.spectral_mask_id is not None:
-        mixture_f = apply_spectral_mask(
-            audio_f=mixture_f,
-            spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
-            seed=mixture.spectral_mask_seed,
-        )
-
-    return mixture_f
-
-
 def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tuple[AudioF, EnergyT]:
     """Apply forward transform to input audio data to generate transform data
 
@@ -497,7 +336,6 @@ def inverse_transform(transform: AudioF, config: TransformConfig) -> AudioT:
     :param config: Transform configuration
     :return: Time domain data [samples]
     """
-    import numpy as np
     from pyaaware import InverseTransform
 
     audio, _ = get_audio_from_transform(
@@ -508,7 +346,7 @@ def inverse_transform(transform: AudioF, config: TransformConfig) -> AudioT:
             bin_start=config.bin_start,
             bin_end=config.bin_end,
             ttype=config.ttype,
-            gain=np.float32(1),
+            gain=1,
         ),
     )
     return audio
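The two hunks above simplify inverse_transform: the local numpy import goes away and the InverseTransform gain is now the plain integer 1. A hedged sketch of calling it with the transform configuration stored on a database; the location, frame count, and inclusive bin-count arithmetic are assumptions, not values from this diff:

```python
# Sketch only: feed a placeholder complex spectrogram through inverse_transform
# using the ft_config of an existing MixtureDatabase.
import numpy as np

from sonusai.mixture.helpers import inverse_transform
from sonusai.mixture.mixdb import MixtureDatabase

mixdb = MixtureDatabase("./my_mixdb")                            # placeholder location
bins = mixdb.ft_config.bin_end - mixdb.ft_config.bin_start + 1   # assumed inclusive bin range
frames = 100                                                     # placeholder frame count
audio_f = np.zeros((frames, bins), dtype=np.complex64)           # placeholder transform data
audio_t = inverse_transform(audio_f, mixdb.ft_config)            # unity gain per this hunk
```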
@@ -532,8 +370,8 @@ def check_audio_files_exist(mixdb: MixtureDatabase) -> None:
 
 
 def augmented_target_samples(
-    target_files:
-    target_augmentations:
+    target_files: list[TargetFile],
+    target_augmentations: list[AugmentationRule],
     feature_step_samples: int,
 ) -> int:
     from itertools import product
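The hunk above only re-types the parameters of augmented_target_samples (list[TargetFile] and list[AugmentationRule]); its use of itertools.product, visible in the surrounding context, suggests candidate counts come from pairing every target file with every augmentation rule. A toy illustration of that product-based enumeration; the sizes are placeholders:

```python
# Toy illustration of the Cartesian-product enumeration; counts are placeholders.
from itertools import product

target_ids = range(3)        # e.g. 3 target files
augmentation_ids = range(4)  # e.g. 4 augmentation rules
pairs = list(product(target_ids, augmentation_ids))
print(len(pairs))            # 12 candidate (target, augmentation) combinations
```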
@@ -555,7 +393,7 @@ def augmented_target_samples(
     )
 
 
-def augmented_noise_samples(noise_files:
+def augmented_noise_samples(noise_files: list[NoiseFile], noise_augmentations: list[Augmentation]) -> int:
     from itertools import product
 
     noise_ids = list(range(len(noise_files)))
@@ -574,6 +412,7 @@ def get_textgrid_tier_from_target_file(target_file: str, tier: str) -> SpeechMet
     from pathlib import Path
 
     from praatio import textgrid
+    from praatio.utilities.constants import Interval
 
     from .tokenized_shell_vars import tokenized_expand
 
@@ -588,10 +427,13 @@ def get_textgrid_tier_from_target_file(target_file: str, tier: str) -> SpeechMet
 
     entries = tg.getTier(tier).entries
     if len(entries) > 1:
-        return
-
+        return [entry for entry in entries if isinstance(entry, Interval)]
+
+    if len(entries) == 1:
         return entries[0].label
 
+    return None
+
 
 def frames_from_samples(samples: int, step_samples: int) -> int:
     import numpy as np