sonusai 0.19.6__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sonusai/__init__.py +1 -1
  2. sonusai/aawscd_probwrite.py +1 -1
  3. sonusai/calc_metric_spenh.py +1 -1
  4. sonusai/genft.py +29 -14
  5. sonusai/genmetrics.py +60 -42
  6. sonusai/genmix.py +41 -29
  7. sonusai/genmixdb.py +56 -64
  8. sonusai/metrics/calc_class_weights.py +1 -3
  9. sonusai/metrics/calc_optimal_thresholds.py +2 -2
  10. sonusai/metrics/calc_phase_distance.py +1 -1
  11. sonusai/metrics/calc_speech.py +6 -6
  12. sonusai/metrics/class_summary.py +6 -15
  13. sonusai/metrics/confusion_matrix_summary.py +11 -27
  14. sonusai/metrics/one_hot.py +3 -3
  15. sonusai/metrics/snr_summary.py +7 -7
  16. sonusai/mixture/__init__.py +2 -17
  17. sonusai/mixture/augmentation.py +5 -6
  18. sonusai/mixture/class_count.py +1 -1
  19. sonusai/mixture/config.py +36 -46
  20. sonusai/mixture/data_io.py +30 -1
  21. sonusai/mixture/datatypes.py +29 -40
  22. sonusai/mixture/db_datatypes.py +1 -1
  23. sonusai/mixture/feature.py +3 -23
  24. sonusai/mixture/generation.py +161 -204
  25. sonusai/mixture/helpers.py +29 -187
  26. sonusai/mixture/mixdb.py +386 -159
  27. sonusai/mixture/soundfile_audio.py +1 -1
  28. sonusai/mixture/sox_audio.py +4 -4
  29. sonusai/mixture/sox_augmentation.py +1 -1
  30. sonusai/mixture/target_class_balancing.py +9 -11
  31. sonusai/mixture/targets.py +23 -20
  32. sonusai/mixture/torchaudio_audio.py +18 -7
  33. sonusai/mixture/torchaudio_augmentation.py +3 -4
  34. sonusai/mixture/truth.py +21 -34
  35. sonusai/mixture/truth_functions/__init__.py +6 -0
  36. sonusai/mixture/truth_functions/crm.py +51 -37
  37. sonusai/mixture/truth_functions/energy.py +95 -50
  38. sonusai/mixture/truth_functions/file.py +12 -8
  39. sonusai/mixture/truth_functions/metadata.py +24 -0
  40. sonusai/mixture/truth_functions/metrics.py +28 -0
  41. sonusai/mixture/truth_functions/phoneme.py +4 -5
  42. sonusai/mixture/truth_functions/sed.py +32 -23
  43. sonusai/mixture/truth_functions/target.py +62 -29
  44. sonusai/mkwav.py +20 -19
  45. sonusai/queries/queries.py +9 -15
  46. sonusai/speech/l2arctic.py +6 -2
  47. sonusai/summarize_metric_spenh.py +1 -1
  48. sonusai/utils/__init__.py +1 -0
  49. sonusai/utils/asr_functions/aaware_whisper.py +1 -1
  50. sonusai/utils/audio_devices.py +27 -18
  51. sonusai/utils/docstring.py +6 -3
  52. sonusai/utils/energy_f.py +5 -3
  53. sonusai/utils/human_readable_size.py +6 -6
  54. sonusai/utils/load_object.py +15 -0
  55. sonusai/utils/onnx_utils.py +2 -2
  56. sonusai/utils/print_mixture_details.py +3 -3
  57. {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/METADATA +2 -2
  58. {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/RECORD +60 -58
  59. sonusai/mixture/truth_functions/datatypes.py +0 -37
  60. {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/WHEEL +0 -0
  61. {sonusai-0.19.6.dist-info → sonusai-0.19.9.dist-info}/entry_points.txt +0 -0
@@ -2,26 +2,19 @@ from pyaaware import ForwardTransform
  from pyaaware import InverseTransform

  from sonusai.mixture.datatypes import AudioF
- from sonusai.mixture.datatypes import AudiosT
  from sonusai.mixture.datatypes import AudioT
  from sonusai.mixture.datatypes import Augmentation
- from sonusai.mixture.datatypes import AugmentationRules
- from sonusai.mixture.datatypes import Augmentations
+ from sonusai.mixture.datatypes import AugmentationRule
  from sonusai.mixture.datatypes import EnergyT
- from sonusai.mixture.datatypes import Feature
  from sonusai.mixture.datatypes import FeatureGeneratorConfig
  from sonusai.mixture.datatypes import FeatureGeneratorInfo
  from sonusai.mixture.datatypes import GeneralizedIDs
  from sonusai.mixture.datatypes import Mixture
  from sonusai.mixture.datatypes import NoiseFile
- from sonusai.mixture.datatypes import NoiseFiles
- from sonusai.mixture.datatypes import Segsnr
  from sonusai.mixture.datatypes import SpeechMetadata
  from sonusai.mixture.datatypes import Target
- from sonusai.mixture.datatypes import TargetFiles
- from sonusai.mixture.datatypes import Targets
+ from sonusai.mixture.datatypes import TargetFile
  from sonusai.mixture.datatypes import TransformConfig
- from sonusai.mixture.datatypes import TruthDict
  from sonusai.mixture.db_datatypes import MixtureRecord
  from sonusai.mixture.db_datatypes import TargetRecord
  from sonusai.mixture.mixdb import MixtureDatabase
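The plural container aliases (AudiosT, Augmentations, AugmentationRules, NoiseFiles, TargetFiles, Targets) plus TruthDict, Feature, and Segsnr are dropped from these imports; the hunks below switch the corresponding signatures to plain list[...] of the singular datatypes. A hedged sketch of what that migration looks like in downstream code (the helper name here is hypothetical):

```python
from sonusai.mixture.datatypes import AudioT
from sonusai.mixture.datatypes import TargetFile

# 0.19.6 style (removed aliases):
# def pick_first(target_files: TargetFiles, audio: AudiosT) -> AudioT: ...

# 0.19.9 style: annotate with list[...] of the singular datatypes
def pick_first(target_files: list[TargetFile], audio: list[AudioT]) -> AudioT:
    """Hypothetical downstream helper; only the annotations are the point."""
    return audio[0]
```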
@@ -142,13 +135,14 @@ def mixture_all_speech_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> lis
      return results


- def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
+ def mixture_metadata(mixdb: MixtureDatabase, m_id: int) -> str:
      """Create a string of metadata for a Mixture

      :param mixdb: Mixture database
-     :param mixture: Mixture record
+     :param m_id: Mixture ID
      :return: String of metadata
      """
+     mixture = mixdb.mixture(m_id)
      metadata = ""
      speech_metadata = mixture_all_speech_metadata(mixdb, mixture)
      for mi, target in enumerate(mixture.targets):
@@ -157,7 +151,7 @@ def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
          metadata += f"target {mi} name: {target_file.name}\n"
          metadata += f"target {mi} augmentation: {target.augmentation.to_dict()}\n"
          metadata += f"target {mi} ir: {mixdb.impulse_response_file(target_augmentation.ir)}\n"
-         metadata += f"target {mi} target_gain: {target.gain}\n"
+         metadata += f"target {mi} target_gain: {target.gain if not mixture.is_noise_only else 0}\n"
          metadata += f"target {mi} class indices: {target_file.class_indices}\n"
          for key in target_file.truth_configs:
              metadata += f"target {mi} truth '{key}' function: {target_file.truth_configs[key].function}\n"
@@ -169,7 +163,7 @@ def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
      metadata += f"noise name: {noise.name}\n"
      metadata += f"noise augmentation: {noise_augmentation.to_dict()}\n"
      metadata += f"noise ir: {mixdb.impulse_response_file(noise_augmentation.ir)}\n"
-     metadata += f"noise offset: {mixture.noise.offset}\n"
+     metadata += f"noise offset: {mixture.noise_offset}\n"
      metadata += f"snr: {mixture.snr}\n"
      metadata += f"random_snr: {mixture.snr.is_random}\n"
      metadata += f"samples: {mixture.samples}\n"
@@ -179,17 +173,17 @@ def mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> str:
      return metadata


- def write_mixture_metadata(mixdb: MixtureDatabase, mixture: Mixture) -> None:
+ def write_mixture_metadata(mixdb: MixtureDatabase, m_id: int) -> None:
      """Write mixture metadata to a text file

      :param mixdb: Mixture database
-     :param mixture: Mixture record
+     :param m_id: Mixture ID
      """
      from os.path import join

-     name = join(mixdb.location, "mixture", mixture.name, "metadata.txt")
+     name = join(mixdb.location, "mixture", mixdb.mixture(m_id).name, "metadata.txt")
      with open(file=name, mode="w") as f:
-         f.write(mixture_metadata(mixdb, mixture))
+         f.write(mixture_metadata(mixdb, m_id))


  def from_mixture(
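mixture_metadata and write_mixture_metadata now take a mixture ID rather than a Mixture record and resolve it internally via mixdb.mixture(m_id). A hedged caller-side sketch (the import path and database location are assumptions based on the file list above):

```python
from sonusai.mixture.helpers import write_mixture_metadata  # assuming this hunk is helpers.py
from sonusai.mixture.mixdb import MixtureDatabase

mixdb = MixtureDatabase("/path/to/mixdb")  # hypothetical location

# 0.19.6: write_mixture_metadata(mixdb, mixdb.mixture(0))
# 0.19.9: pass the mixture ID; the helper looks up the record itself
write_mixture_metadata(mixdb, 0)
```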
@@ -199,7 +193,7 @@ def from_mixture(
          mixture.name,
          mixture.noise.file_id,
          mixture.noise.augmentation.to_json(),
-         mixture.noise.offset,
+         mixture.noise_offset,
          mixture.noise_snr_gain,
          mixture.snr.is_random,
          mixture.snr,
@@ -210,7 +204,7 @@
      )


- def to_mixture(entry: MixtureRecord, targets: Targets) -> Mixture:
+ def to_mixture(entry: MixtureRecord, targets: list[Target]) -> Mixture:
      import json

      from sonusai.utils import dataclass_from_dict
@@ -223,9 +217,9 @@ def to_mixture(entry: MixtureRecord, targets: Targets) -> Mixture:
          name=entry.name,
          noise=Noise(
              file_id=entry.noise_file_id,
-             augmentation=dataclass_from_dict(Augmentation, json.loads(entry.noise_augmentation)),
-             offset=entry.noise_offset,
+             augmentation=dataclass_from_dict(Augmentation, json.loads(entry.noise_augmentation)),  # pyright: ignore [reportArgumentType]
          ),
+         noise_offset=entry.noise_offset,
          noise_snr_gain=entry.noise_snr_gain,
          snr=UniversalSNR(is_random=entry.random_snr, value=entry.snr),
          samples=entry.samples,
@@ -235,8 +229,8 @@ def to_mixture(entry: MixtureRecord, targets: Targets) -> Mixture:
      )


- def from_target(target: Target) -> tuple[int, str, float]:
-     return target.file_id, target.augmentation.to_json(), target.gain
+ def from_target(target: Target) -> tuple[int, str]:
+     return target.file_id, target.augmentation.to_json()


  def to_target(entry: TargetRecord) -> Target:
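With these record changes, the noise offset moves from the per-mixture Noise record to a noise_offset field on Mixture itself, and target gain is no longer stored in the target record tuple. A hedged migration sketch for code that reads Mixture objects (database location is hypothetical):

```python
from sonusai.mixture.mixdb import MixtureDatabase

mixdb = MixtureDatabase("/path/to/mixdb")  # hypothetical location
mixture = mixdb.mixture(0)

# 0.19.6: the offset lived on the Noise record
# offset = mixture.noise.offset

# 0.19.9: it is a top-level Mixture field
offset = mixture.noise_offset
```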
@@ -245,147 +239,14 @@ def to_target(entry: TargetRecord) -> Target:
      from sonusai.utils import dataclass_from_dict

      from .datatypes import Augmentation
-     from .datatypes import Target

      return Target(
          file_id=entry.file_id,
-         augmentation=dataclass_from_dict(Augmentation, json.loads(entry.augmentation)),
-         gain=entry.gain,
-     )
-
-
- def get_truth(
-     mixdb: MixtureDatabase,
-     mixture: Mixture,
-     targets_audio: AudiosT,
-     noise_audio: AudioT,
-     mixture_audio: AudioT,
- ) -> TruthDict:
-     """Get the truth data for the given mixture record
-
-     :param mixdb: Mixture database
-     :param mixture: Mixture record
-     :param targets_audio: List of augmented target audio data (one per target in the mixup) for the given mixture ID
-     :param noise_audio: Augmented noise audio data for the given mixture ID
-     :param mixture_audio: Mixture audio data for the given mixture ID
-     :return: truth data
-     """
-     from .datatypes import TruthDict
-     from .truth import truth_function
-
-     if not all(len(target) == mixture.samples for target in targets_audio):
-         raise ValueError("Lengths of targets do not match length of mixture")
-
-     if len(noise_audio) != mixture.samples:
-         raise ValueError("Length of noise does not match length of mixture")
-
-     # TODO: Need to understand how to do this correctly for mixup and target_mixture_f truth
-     if len(targets_audio) != 1:
-         raise NotImplementedError("mixup is not implemented")
-
-     truth: TruthDict = {}
-     for idx in range(len(targets_audio)):
-         target_file = mixdb.target_file(mixture.targets[idx].file_id)
-         for key, value in target_file.truth_configs.items():
-             truth[key] = truth_function(
-                 target_audio=targets_audio[idx],
-                 noise_audio=noise_audio,
-                 mixture_audio=mixture_audio,
-                 config=value,
-                 feature=mixdb.feature,
-                 num_classes=mixdb.num_classes,
-                 class_indices=target_file.class_indices,
-                 target_gain=mixture.targets[idx].gain * mixture.target_snr_gain,
-             )
-
-     return truth
-
-
- def get_ft(
-     mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, truth_t: TruthDict
- ) -> tuple[Feature, TruthDict]:
-     """Get the feature and truth_f data for the given mixture record
-
-     :param mixdb: Mixture database
-     :param mixture: Mixture record
-     :param mixture_audio: Mixture audio data for the given mixid
-     :param truth_t: truth_t for the given mixid
-     :return: Tuple of (feature, truth_f) data
-     """
-
-     from pyaaware import FeatureGenerator
-
-     from .truth import truth_stride_reduction
-
-     mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)
-
-     fg = FeatureGenerator(mixdb.fg_config.feature_mode, mixdb.fg_config.truth_parameters)
-     feature, truth_f = fg.execute_all(mixture_f, truth_t)
-     for name in truth_f:
-         truth_f[name] = truth_stride_reduction(truth_f[name], mixdb.truth_configs[name].stride_reduction)
-
-     return feature, truth_f
-
-
- def get_segsnr(mixdb: MixtureDatabase, mixture: Mixture, target_audio: AudioT, noise: AudioT) -> Segsnr:
-     """Get the segsnr data for the given mixture record
-
-     :param mixdb: Mixture database
-     :param mixture: Mixture record
-     :param target_audio: Augmented target audio data
-     :param noise: Augmented noise audio data
-     :return: segsnr data
-     """
-     segsnr_t = get_segsnr_t(mixdb=mixdb, mixture=mixture, target_audio=target_audio, noise_audio=noise)
-     return segsnr_t[0 :: mixdb.ft_config.overlap]
-
-
- def get_segsnr_t(mixdb: MixtureDatabase, mixture: Mixture, target_audio: AudioT, noise_audio: AudioT) -> Segsnr:
-     """Get the segsnr_t data for the given mixture record
-
-     :param mixdb: Mixture database
-     :param mixture: Mixture record
-     :param target_audio: Augmented target audio data
-     :param noise_audio: Augmented noise audio data
-     :return: segsnr_t data
-     """
-     import numpy as np
-     import torch
-     from pyaaware import ForwardTransform
-
-     fft = ForwardTransform(
-         length=mixdb.ft_config.length,
-         overlap=mixdb.ft_config.overlap,
-         bin_start=mixdb.ft_config.bin_start,
-         bin_end=mixdb.ft_config.bin_end,
-         ttype=mixdb.ft_config.ttype,
+         augmentation=dataclass_from_dict(Augmentation, json.loads(entry.augmentation)),  # pyright: ignore [reportArgumentType]
      )

-     segsnr_t = np.empty(mixture.samples, dtype=np.float32)
-
-     target_energy = fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
-     noise_energy = fft.execute_all(torch.from_numpy(noise_audio))[1].numpy()
-
-     offsets = range(0, mixture.samples, mixdb.ft_config.overlap)
-     if len(target_energy) != len(offsets):
-         raise ValueError(
-             f"Number of frames in energy, {len(target_energy)}," f" is not number of frames in mixture, {len(offsets)}"
-         )
-
-     for idx, offset in enumerate(offsets):
-         indices = slice(offset, offset + mixdb.ft_config.overlap)
-
-         if noise_energy[idx] == 0:
-             snr = np.float32(np.inf)
-         else:
-             snr = np.float32(target_energy[idx] / noise_energy[idx])
-
-         segsnr_t[indices] = snr
-
-     return segsnr_t

-
- def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT) -> AudioT:
+ def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: list[AudioT]) -> AudioT:
      """Get the augmented target audio data for the given mixture record

      :param mixdb: Mixture database
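get_truth, get_ft, get_segsnr, and get_segsnr_t are removed from this module in 0.19.9 (per the file list, mixdb.py grows by several hundred lines in the same release, which is presumably where equivalent functionality now lives). For reference, the per-sample segmental SNR that the removed get_segsnr_t produced can be sketched standalone as below; plain framewise energies stand in for the pyaaware ForwardTransform energies the original used, and frame_size plays the role of mixdb.ft_config.overlap:

```python
import numpy as np


def segsnr_t_sketch(target_audio: np.ndarray, noise_audio: np.ndarray, frame_size: int) -> np.ndarray:
    """Per-sample segmental SNR, mirroring the removed get_segsnr_t logic.

    Framewise sums of squares replace the transform-domain energies of the
    original; this is an illustration, not the library API.
    """
    samples = len(target_audio)
    segsnr_t = np.empty(samples, dtype=np.float32)
    for offset in range(0, samples, frame_size):
        frame = slice(offset, offset + frame_size)
        target_energy = float(np.sum(np.square(target_audio[frame], dtype=np.float64)))
        noise_energy = float(np.sum(np.square(noise_audio[frame], dtype=np.float64)))
        # Same convention as the removed helper: a silent noise frame yields infinite SNR
        segsnr_t[frame] = np.inf if noise_energy == 0 else target_energy / noise_energy
    return segsnr_t
```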
@@ -413,28 +274,6 @@ def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT)
      return np.sum(targets_ir, axis=0)


- def get_mixture_f(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT) -> AudioF:
-     """Get the mixture transform for the given mixture
-
-     :param mixdb: Mixture database
-     :param mixture: Mixture record
-     :param mixture_audio: Mixture audio data for the given mixid
-     :return: Mixture transform data
-     """
-     from .spectral_mask import apply_spectral_mask
-
-     mixture_f = forward_transform(mixture_audio, mixdb.ft_config)
-
-     if mixture.spectral_mask_id is not None:
-         mixture_f = apply_spectral_mask(
-             audio_f=mixture_f,
-             spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
-             seed=mixture.spectral_mask_seed,
-         )
-
-     return mixture_f
-
-
  def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tuple[AudioF, EnergyT]:
      """Apply forward transform to input audio data to generate transform data

@@ -497,7 +336,6 @@ def inverse_transform(transform: AudioF, config: TransformConfig) -> AudioT:
      :param config: Transform configuration
      :return: Time domain data [samples]
      """
-     import numpy as np
      from pyaaware import InverseTransform

      audio, _ = get_audio_from_transform(
@@ -508,7 +346,7 @@ def inverse_transform(transform: AudioF, config: TransformConfig) -> AudioT:
              bin_start=config.bin_start,
              bin_end=config.bin_end,
              ttype=config.ttype,
-             gain=np.float32(1),
+             gain=1,
          ),
      )
      return audio
@@ -532,8 +370,8 @@ def check_audio_files_exist(mixdb: MixtureDatabase) -> None:


  def augmented_target_samples(
-     target_files: TargetFiles,
-     target_augmentations: AugmentationRules,
+     target_files: list[TargetFile],
+     target_augmentations: list[AugmentationRule],
      feature_step_samples: int,
  ) -> int:
      from itertools import product
@@ -555,7 +393,7 @@
      )


- def augmented_noise_samples(noise_files: NoiseFiles, noise_augmentations: Augmentations) -> int:
+ def augmented_noise_samples(noise_files: list[NoiseFile], noise_augmentations: list[Augmentation]) -> int:
      from itertools import product

      noise_ids = list(range(len(noise_files)))
@@ -574,6 +412,7 @@ def get_textgrid_tier_from_target_file(target_file: str, tier: str) -> SpeechMet
      from pathlib import Path

      from praatio import textgrid
+     from praatio.utilities.constants import Interval

      from .tokenized_shell_vars import tokenized_expand

@@ -588,10 +427,13 @@ def get_textgrid_tier_from_target_file(target_file: str, tier: str) -> SpeechMet

      entries = tg.getTier(tier).entries
      if len(entries) > 1:
-         return list(entries)
-     else:
+         return [entry for entry in entries if isinstance(entry, Interval)]
+
+     if len(entries) == 1:
          return entries[0].label

+     return None
+

  def frames_from_samples(samples: int, step_samples: int) -> int:
      import numpy as np
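After this last change, the tier lookup returns a list of praatio Interval entries for multi-entry tiers, a bare label string for single-entry tiers, and an explicit None when the tier has no entries. A hedged caller-side sketch (import path, file path, and tier name are illustrative):

```python
from sonusai.mixture.helpers import get_textgrid_tier_from_target_file  # assuming this hunk is helpers.py

metadata = get_textgrid_tier_from_target_file("speech/utt0001.wav", "words")  # illustrative arguments

if metadata is None:
    print("tier has no entries")
elif isinstance(metadata, str):
    print(f"single label: {metadata}")
else:
    for interval in metadata:  # list of praatio Interval entries
        print(interval.start, interval.end, interval.label)
```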