sonusai 0.18.5__py3-none-any.whl → 0.18.7__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
sonusai/__init__.py CHANGED
@@ -2,6 +2,12 @@ import logging
 from importlib import metadata
 from os.path import dirname
 
+from pyaaware import TorchForwardTransform
+from pyaaware import TorchInverseTransform
+
+ForwardTransform = TorchForwardTransform
+InverseTransform = TorchInverseTransform
+
 __version__ = metadata.version(__package__)
 BASEDIR = dirname(__file__)
 
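sonusai now re-exports the Torch-based transforms from pyaaware under package-level names, and the rest of this diff moves internal call sites from the Aaware* classes to these aliases. A minimal sketch of what the aliasing means for importers:

    from pyaaware import TorchForwardTransform, TorchInverseTransform
    from sonusai import ForwardTransform, InverseTransform

    # The package-level names are plain aliases, not wrappers.
    assert ForwardTransform is TorchForwardTransform
    assert InverseTransform is TorchInverseTransform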
sonusai/genmetrics.py CHANGED
@@ -115,11 +115,11 @@ def main() -> None:
     mixdb = MixtureDatabase(location)
     supported = mixdb.supported_metrics
     if show_supported:
-        logger.info(f'\nSupported metrics: {", ".join(sorted(supported))}')
+        logger.info(f'\nSupported metrics:\n\n{supported.pretty}')
         sys.exit(0)
 
     if includes is None or 'all' in includes:
-        metrics = supported
+        metrics = supported.names
     else:
         metrics = set(includes)
         if 'mxwer' in metrics:
@@ -127,7 +127,7 @@ def main() -> None:
             for name in mixdb.asr_configs:
                 metrics.add(f'mxwer.{name}')
 
-    diff = metrics.difference(supported)
+    diff = metrics.difference(supported.names)
     if diff:
         logger.error(f'Unrecognized metric: {", ".join(diff)}')
         sys.exit(1)
@@ -141,7 +141,7 @@ def main() -> None:
             for name in mixdb.asr_configs:
                 _excludes.add(f'mxwer.{name}')
 
-    diff = _excludes.difference(supported)
+    diff = _excludes.difference(supported.names)
     if diff:
         logger.error(f'Unrecognized metric: {", ".join(diff)}')
         sys.exit(1)
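genmetrics now expects mixdb.supported_metrics to be a MetricDocs collection (added to sonusai.mixture.datatypes below) rather than a plain set of names: supported.pretty gives the formatted listing for the show_supported path, and supported.names gives the set used for include/exclude validation. A rough sketch of the call pattern, with a hypothetical database location and hypothetical metric names:

    from sonusai.mixture import MixtureDatabase

    mixdb = MixtureDatabase('/path/to/mixdb')        # hypothetical location
    print(mixdb.supported_metrics.pretty)            # grouped name/description listing
    requested = {'mxwer.whisper', 'not_a_metric'}    # hypothetical metric names
    unknown = requested - mixdb.supported_metrics.names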
@@ -8,7 +8,8 @@ from .calc_pesq import calc_pesq
 from .calc_phase_distance import calc_phase_distance
 from .calc_sa_sdr import calc_sa_sdr
 from .calc_sample_weights import calc_sample_weights
-from .calc_snr_f import calc_snr_f
+from .calc_segsnr_f import calc_segsnr_f
+from .calc_segsnr_f import calc_segsnr_f_bin
 from .calc_speech import calc_speech
 from .calc_wer import calc_wer
 from .calc_wsdr import calc_wsdr
@@ -2,6 +2,14 @@ from sonusai.mixture.datatypes import AudioStatsMetrics
 from sonusai.mixture.datatypes import AudioT
 
 
+def _convert_str_with_factors_to_int(x: str) -> int:
+    if 'k' in x:
+        return int(1000 * float(x.replace('k', '')))
+    if 'M' in x:
+        return int(1000000 * float(x.replace('M', '')))
+    return int(x)
+
+
 def calc_audio_stats(audio: AudioT, win_len: float = None) -> AudioStatsMetrics:
     from sonusai.mixture import SAMPLE_RATE
     from sonusai.mixture import Transformer
@@ -38,5 +46,5 @@ def calc_audio_stats(audio: AudioT, win_len: float = None) -> AudioStatsMetrics:
         tr=float(stats['RMS Tr dB']),
         cr=float(stats['Crest factor']),
         fl=float(stats['Flat factor']),
-        pkc=int(stats['Pk count']),
+        pkc=_convert_str_with_factors_to_int(stats['Pk count']),
     )
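The new helper exists because the 'Pk count' field of the stats report can carry a magnitude suffix (for example '10.5k'), which int() cannot parse directly. Expected behaviour, shown with illustrative values:

    _convert_str_with_factors_to_int('123')    # -> 123
    _convert_str_with_factors_to_int('10.5k')  # -> 10500
    _convert_str_with_factors_to_int('2M')     # -> 2000000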
@@ -0,0 +1,84 @@
+import numpy as np
+
+from sonusai.mixture.datatypes import AudioF
+from sonusai.mixture.datatypes import Segsnr
+from sonusai.mixture.datatypes import SnrFBinMetrics
+from sonusai.mixture.datatypes import SnrFMetrics
+
+
+def calc_segsnr_f(segsnr_f: Segsnr) -> SnrFMetrics:
+    """Calculate metrics of snr_f truth data.
+
+    Includes mean and standard deviation of the linear values (usually energy)
+    and mean and standard deviation of the dB values (10 * log10).
+    """
+    if np.count_nonzero(segsnr_f) == 0:
+        # If all entries are zeros
+        return SnrFMetrics(0, 0, -np.inf, 0)
+
+    tmp = np.ma.array(segsnr_f, mask=np.logical_not(np.isfinite(segsnr_f)))
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are infinite
+        return SnrFMetrics(np.inf, 0, np.inf, 0)
+
+    snr_mean = np.mean(tmp, axis=0)
+    snr_std = np.std(tmp, axis=0)
+
+    tmp = 10 * np.ma.log10(tmp)
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are masked, special case where all inputs are either 0 or infinite
+        snr_db_mean = -np.inf
+        snr_db_std = np.inf
+    else:
+        snr_db_mean = np.mean(tmp, axis=0)
+        snr_db_std = np.std(tmp, axis=0)
+
+    return SnrFMetrics(snr_mean,
+                       snr_std,
+                       snr_db_mean,
+                       snr_db_std)
+
+
+def calc_segsnr_f_bin(target_f: AudioF, noise_f: AudioF) -> SnrFBinMetrics:
+    """Calculate per-bin segmental SNR metrics.
+
+    Includes per-bin mean and standard deviation of the linear values
+    and mean and standard deviation of the dB values.
+    """
+    if target_f.ndim != 2 and noise_f.ndim != 2:
+        raise ValueError('target_f and noise_f must have 2 dimensions')
+
+    segsnr_f = (np.abs(target_f) ** 2) / (np.abs(noise_f) ** 2)
+
+    frames, bins = segsnr_f.shape
+    if np.count_nonzero(segsnr_f) == 0:
+        # If all entries are zeros
+        return SnrFBinMetrics(np.zeros(bins),
+                              np.zeros(bins),
+                              -np.inf * np.ones(bins),
+                              np.zeros(bins))
+
+    tmp = np.ma.array(segsnr_f, mask=np.logical_not(np.isfinite(segsnr_f)))
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are infinite
+        return SnrFBinMetrics(np.inf * np.ones(bins),
+                              np.zeros(bins),
+                              np.inf * np.ones(bins),
+                              np.zeros(bins))
+
+    snr_mean = np.mean(tmp, axis=0)
+    snr_std = np.std(tmp, axis=0)
+
+    tmp = 10 * np.ma.log10(tmp)
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are masked, special case where all inputs are either 0 or infinite
+        snr_db_mean = -np.inf * np.ones(bins)
+        snr_db_std = np.inf * np.ones(bins)
+    else:
+        snr_db_mean = np.mean(tmp, axis=0)
+        snr_db_std = np.std(tmp, axis=0)
+
+    return SnrFBinMetrics(np.ma.getdata(snr_mean),
+                          np.ma.getdata(snr_std),
+                          np.ma.getdata(snr_db_mean),
+                          np.ma.getdata(snr_db_std))
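calc_segsnr_f_bin derives a per-bin SNR from target and noise spectrograms (|target|^2 / |noise|^2), masks non-finite ratios, and reduces over frames. A small sketch of how it might be driven; the random arrays below merely stand in for real forward-transform output:

    import numpy as np
    from sonusai.metrics import calc_segsnr_f_bin

    rng = np.random.default_rng(0)
    shape = (100, 257)  # [frames, bins]; sizes are arbitrary for the example
    target_f = rng.standard_normal(shape) + 1j * rng.standard_normal(shape)
    noise_f = rng.standard_normal(shape) + 1j * rng.standard_normal(shape)

    m = calc_segsnr_f_bin(target_f, noise_f)
    print(m.db_avg.shape)  # (257,): mean per-bin SNR in dB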
@@ -6,7 +6,7 @@ from .calc_pesq import calc_pesq
 
 
 def calc_speech(hypothesis: np.ndarray, reference: np.ndarray, sample_rate: int = SAMPLE_RATE) -> SpeechMetrics:
-    """Calculate speech metrics pesq, c_sig, c_bak, c_ovl, seg_snr.
+    """Calculate speech metrics pesq, c_sig, c_bak, and c_ovl.
 
     These are all related and thus included in one function. Reference: matlab script "compute_metrics.m".
 
@@ -38,11 +38,11 @@ def calc_speech(hypothesis: np.ndarray, reference: np.ndarray, sample_rate: int
     _pesq = calc_pesq(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
 
     # Now compute the composite measures
-    c_sig = np.clip(3.093 - 1.029 * llr_mean + 0.603 * _pesq - 0.009 * wss_dist, 1, 5)
-    c_bak = np.clip(1.634 + 0.478 * _pesq - 0.007 * wss_dist + 0.063 * seg_snr, 1, 5)
-    c_ovl = np.clip(1.594 + 0.805 * _pesq - 0.512 * llr_mean - 0.007 * wss_dist, 1, 5)
+    csig = np.clip(3.093 - 1.029 * llr_mean + 0.603 * _pesq - 0.009 * wss_dist, 1, 5)
+    cbak = np.clip(1.634 + 0.478 * _pesq - 0.007 * wss_dist + 0.063 * seg_snr, 1, 5)
+    covl = np.clip(1.594 + 0.805 * _pesq - 0.512 * llr_mean - 0.007 * wss_dist, 1, 5)
 
-    return SpeechMetrics(_pesq, c_sig, c_bak, c_ovl)
+    return SpeechMetrics(_pesq, csig, cbak, covl)
 
 
 def _calc_weighted_spectral_slope_measure(hypothesis: np.ndarray,
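Only the local names and the SpeechMetrics field names change here (c_sig/c_bak/c_ovl become csig/cbak/covl in the datatypes diff below); the composite-measure formulas themselves are untouched. Callers that read the named-tuple fields need the new spellings, e.g. (hypothetical input arrays):

    m = calc_speech(hypothesis=degraded, reference=clean)  # degraded/clean: hypothetical audio arrays
    print(m.pesq, m.csig, m.cbak, m.covl)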
@@ -66,6 +66,8 @@ from .datatypes import GeneralizedIDs
 from .datatypes import ImpulseResponseData
 from .datatypes import ImpulseResponseFiles
 from .datatypes import ListAudiosT
+from .datatypes import MetricDoc
+from .datatypes import MetricDocs
 from .datatypes import Mixture
 from .datatypes import MixtureDatabaseConfig
 from .datatypes import Mixtures
@@ -105,6 +107,7 @@ from .helpers import augmented_noise_samples
 from .helpers import augmented_target_samples
 from .helpers import check_audio_files_exist
 from .helpers import forward_transform
+from .helpers import frames_from_samples
 from .helpers import get_audio_from_transform
 from .helpers import get_ft
 from .helpers import get_segsnr
@@ -1,7 +1,9 @@
 from dataclasses import dataclass
 from typing import Any
+from typing import Iterable
 from typing import NamedTuple
 from typing import Optional
+from typing import SupportsIndex
 from typing import TypeAlias
 
 import numpy as np
@@ -336,17 +338,24 @@ SpeechMetadata: TypeAlias = str | list[Interval] | None
 
 
 class SnrFMetrics(NamedTuple):
-    mean: Optional[float] = None
-    var: Optional[float] = None
-    db_mean: Optional[float] = None
+    avg: Optional[float] = None
+    std: Optional[float] = None
+    db_avg: Optional[float] = None
     db_std: Optional[float] = None
 
 
+class SnrFBinMetrics(NamedTuple):
+    avg: Optional[np.ndarray] = None
+    std: Optional[np.ndarray] = None
+    db_avg: Optional[np.ndarray] = None
+    db_std: Optional[np.ndarray] = None
+
+
 class SpeechMetrics(NamedTuple):
     pesq: Optional[float] = None
-    c_sig: Optional[float] = None
-    c_bak: Optional[float] = None
-    c_ovl: Optional[float] = None
+    csig: Optional[float] = None
+    cbak: Optional[float] = None
+    covl: Optional[float] = None
 
 
 class AudioStatsMetrics(NamedTuple):
@@ -360,3 +369,53 @@ class AudioStatsMetrics(NamedTuple):
     cr: Optional[float] = None
     fl: Optional[float] = None
     pkc: Optional[float] = None
+
+
+@dataclass
+class MetricDoc:
+    category: str
+    name: str
+    description: str
+
+
+class MetricDocs(list[MetricDoc]):
+    def __init__(self, __iterable: Iterable[MetricDoc]) -> None:
+        super().__init__(item for item in __iterable)
+
+    def __setitem__(self, __key: SupportsIndex, __value: MetricDoc) -> None:  # type: ignore
+        super().__setitem__(__key, __value)
+
+    def insert(self, __index: SupportsIndex, __object: MetricDoc) -> None:
+        super().insert(__index, __object)
+
+    def append(self, __object: MetricDoc) -> None:
+        super().append(__object)
+
+    def extend(self, __iterable: Iterable[MetricDoc]) -> None:
+        if isinstance(__iterable, type(self)):
+            super().extend(__iterable)
+        else:
+            super().extend(item for item in __iterable)
+
+    @property
+    def pretty(self) -> str:
+        max_category_len = ((max([len(item.category) for item in self]) + 9) // 10) * 10
+        max_name_len = 2 + ((max([len(item.name) for item in self]) + 1) // 2) * 2
+        categories: list[str] = []
+        for item in self:
+            if item.category not in categories:
+                categories.append(item.category)
+
+        result = ''
+        for category in categories:
+            result += f'{category}\n'
+            result += '-' * max_category_len + '\n'
+            for item in [sub for sub in self if sub.category == category]:
+                result += f' {item.name:<{max_name_len}}{item.description}\n'
+            result += '\n'
+
+        return result
+
+    @property
+    def names(self) -> set[str]:
+        return set(item.name for item in self)
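MetricDocs keeps the documented metrics ordered, groups them by category for the pretty listing, and exposes the bare identifiers through names. A rough sketch with made-up entries (the real entries are defined by MixtureDatabase.supported_metrics):

    docs = MetricDocs([
        MetricDoc('Mixture Metrics', 'mxwer.faster', 'Hypothetical word-error-rate metric'),
        MetricDoc('Mixture Metrics', 'mxsnr', 'Hypothetical SNR metric'),
    ])
    print(docs.pretty)   # category heading, dashed rule, aligned name/description rows
    print(docs.names)    # {'mxwer.faster', 'mxsnr'}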
@@ -1,6 +1,5 @@
 from typing import Optional
 
-from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import Feature
 
@@ -58,15 +57,13 @@ def get_feature_from_audio(audio: AudioT,
 def get_audio_from_feature(feature: Feature,
                            feature_mode: str,
                            num_classes: Optional[int] = 1,
-                           truth_mutex: Optional[bool] = False,
-                           trim: Optional[bool] = True) -> AudioT:
+                           truth_mutex: Optional[bool] = False) -> AudioT:
     """Apply inverse transform to feature data to generate audio data
 
     :param feature: Feature data [frames, strides, feature_parameters]
     :param feature_mode: Feature mode
     :param num_classes: Number of classes
     :param truth_mutex: Whether to calculate 'other' label
-    :param trim: Whether to trim the audio data
     :return: Audio data [samples]
     """
     import numpy as np
@@ -76,6 +73,7 @@ def get_audio_from_feature(feature: Feature,
     from .datatypes import TransformConfig
     from .helpers import inverse_transform
     from sonusai.utils.stacked_complex import unstack_complex
+    from sonusai.utils.compress import power_uncompress
 
     fg = FeatureGenerator(feature_mode=feature_mode,
                           num_classes=num_classes,
@@ -83,23 +81,10 @@ def get_audio_from_feature(feature: Feature,
 
     feature_complex = unstack_complex(feature)
     if feature_mode[0:1] == 'h':
-        feature_complex = _power_uncompress(feature_complex)
+        feature_complex = power_uncompress(feature_complex)
     return np.squeeze(inverse_transform(transform=feature_complex,
                                         config=TransformConfig(N=fg.itransform_N,
                                                                R=fg.itransform_R,
                                                                bin_start=fg.bin_start,
                                                                bin_end=fg.bin_end,
-                                                               ttype=fg.itransform_ttype),
-                                        trim=trim))
-
-
-def _power_uncompress(feature: AudioF) -> AudioF:
-    import numpy as np
-
-    mag = np.abs(feature)
-    phase = np.angle(feature)
-    mag = mag ** (1. / 0.3)
-    real_uncompress = mag * np.cos(phase)
-    imag_uncompress = mag * np.sin(phase)
-
-    return real_uncompress + 1j * imag_uncompress
+                                                               ttype=fg.itransform_ttype)))
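The private _power_uncompress helper is dropped in favor of sonusai.utils.compress.power_uncompress; the operation itself, undoing the mag**0.3 power compression used by 'h' feature modes while keeping phase, is unchanged from the removed code. An equivalent numpy sketch for reference (hypothetical function name):

    import numpy as np

    def power_uncompress_sketch(x: np.ndarray) -> np.ndarray:
        # Invert power-law magnitude compression (mag**0.3 -> mag); preserve phase.
        mag = np.abs(x) ** (1. / 0.3)
        phase = np.angle(x)
        return mag * np.cos(phase) + 1j * mag * np.sin(phase)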
@@ -2,9 +2,9 @@ from typing import Any
 from typing import Optional
 
 from praatio.utilities.constants import Interval
-from pyaaware import ForwardTransform
-from pyaaware import InverseTransform
 
+from sonusai import ForwardTransform
+from sonusai import InverseTransform
 from sonusai.mixture import EnergyT
 from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
@@ -285,7 +285,10 @@ def read_mixture_data(name: str, items: list[str] | str) -> Any:
 
     def _get_dataset(file: h5py.File, d_name: str) -> Any:
         if d_name in file:
-            return np.array(file[d_name])
+            data = np.array(file[d_name])
+            if data.size == 1:
+                return data.item()
+            return data
         return None
 
     if not isinstance(items, list):
@@ -371,8 +374,8 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
 
     mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)
 
-    transform_frames = mixdb.mixture_transform_frames(mixture.samples)
-    feature_frames = mixdb.mixture_feature_frames(mixture.samples)
+    transform_frames = frames_from_samples(mixture.samples, mixdb.ft_config.R)
+    feature_frames = frames_from_samples(mixture.samples, mixdb.feature_step_samples)
 
     feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.feature_parameters), dtype=np.float32)
     truth_f = np.empty((feature_frames, mixdb.num_classes), dtype=np.complex64)
@@ -418,20 +421,21 @@ def get_segsnr_t(mixdb: MixtureDatabase, mixture: Mixture, target_audio: AudioT,
     :return: segsnr_t data
     """
     import numpy as np
-    from pyaaware import AawareForwardTransform
+    import torch
+    from sonusai import ForwardTransform
 
     from sonusai import SonusAIError
 
-    fft = AawareForwardTransform(N=mixdb.ft_config.N,
-                                 R=mixdb.ft_config.R,
-                                 bin_start=mixdb.ft_config.bin_start,
-                                 bin_end=mixdb.ft_config.bin_end,
-                                 ttype=mixdb.ft_config.ttype)
+    fft = ForwardTransform(N=mixdb.ft_config.N,
+                           R=mixdb.ft_config.R,
+                           bin_start=mixdb.ft_config.bin_start,
+                           bin_end=mixdb.ft_config.bin_end,
+                           ttype=mixdb.ft_config.ttype)
 
     segsnr_t = np.empty(mixture.samples, dtype=np.float32)
 
-    _, target_energy = fft.execute_all(target_audio)
-    _, noise_energy = fft.execute_all(noise_audio)
+    target_energy = fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
+    noise_energy = fft.execute_all(torch.from_numpy(noise_audio))[1].numpy()
 
     offsets = range(0, mixture.samples, mixdb.ft_config.R)
     if len(target_energy) != len(offsets):
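With the package-level aliases in place, execute_all() operates on torch tensors rather than numpy arrays, so get_segsnr_t and the transform helpers wrap their numpy inputs with torch.from_numpy() and call .numpy() on the results. The round-trip, shown standalone with placeholder data:

    import numpy as np
    import torch

    audio = np.zeros(16000, dtype=np.float32)   # placeholder time-domain samples
    audio_t = torch.from_numpy(audio)           # zero-copy tensor view passed to execute_all()
    # execute_all(audio_t) returns torch tensors; .numpy() brings the results back for sonusai.
    back = audio_t.numpy()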
@@ -505,8 +509,11 @@ def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tupl
     :param transform: ForwardTransform object
     :return: Frequency domain data [frames, bins], Energy [frames]
     """
-    f, e = transform.execute_all(audio)
-    return f.transpose(), e
+    import torch
+
+    f, e = transform.execute_all(torch.from_numpy(audio))
+
+    return f.numpy(), e.numpy()
 
 
 def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
@@ -518,54 +525,50 @@ def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
     :param config: Transform configuration
     :return: Frequency domain data [frames, bins]
     """
-    from pyaaware import AawareForwardTransform
+    from sonusai import ForwardTransform
 
     audio_f, _ = get_transform_from_audio(audio=audio,
-                                          transform=AawareForwardTransform(N=config.N,
-                                                                           R=config.R,
-                                                                           bin_start=config.bin_start,
-                                                                           bin_end=config.bin_end,
-                                                                           ttype=config.ttype))
+                                          transform=ForwardTransform(N=config.N,
+                                                                     R=config.R,
+                                                                     bin_start=config.bin_start,
+                                                                     bin_end=config.bin_end,
+                                                                     ttype=config.ttype))
     return audio_f
 
 
-def get_audio_from_transform(data: AudioF, transform: InverseTransform, trim: bool = True) -> tuple[AudioT, EnergyT]:
+def get_audio_from_transform(data: AudioF, transform: InverseTransform) -> tuple[AudioT, EnergyT]:
     """Apply inverse transform to input transform data to generate audio data
 
     :param data: Frequency domain data [frames, bins]
     :param transform: InverseTransform object
-    :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the transform
     :return: Time domain data [samples], Energy [frames]
     """
-    t, e = transform.execute_all(data.transpose())
-    if trim:
-        t = t[transform.N - transform.R:]
+    import torch
 
-    return t, e
+    t, e = transform.execute_all(torch.from_numpy(data))
 
+    return t.numpy(), e.numpy()
 
-def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = True) -> AudioT:
+
+def inverse_transform(transform: AudioF, config: TransformConfig) -> AudioT:
     """Transform frequency domain data into time domain using the inverse transform config from the feature
 
     A new transform is used for each call; i.e., state is not maintained between calls to inverse_transform().
 
     :param transform: Frequency domain data [frames, bins]
     :param config: Transform configuration
-    :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the
-        transform
     :return: Time domain data [samples]
     """
     import numpy as np
-    from pyaaware import AawareInverseTransform
+    from sonusai import InverseTransform
 
     audio, _ = get_audio_from_transform(data=transform,
-                                        transform=AawareInverseTransform(N=config.N,
-                                                                         R=config.R,
-                                                                         bin_start=config.bin_start,
-                                                                         bin_end=config.bin_end,
-                                                                         ttype=config.ttype,
-                                                                         gain=np.float32(1)),
-                                        trim=trim)
+                                        transform=InverseTransform(N=config.N,
+                                                                   R=config.R,
+                                                                   bin_start=config.bin_start,
+                                                                   bin_end=config.bin_end,
+                                                                   ttype=config.ttype,
+                                                                   gain=np.float32(1)))
    return audio
 
 
@@ -641,3 +644,9 @@ def get_textgrid_tier_from_target_file(target_file: str, tier: str) -> Optional[
         return list(entries)
     else:
         return entries[0].label
+
+
+def frames_from_samples(samples: int, step_samples: int) -> int:
+    import numpy as np
+
+    return int(np.ceil(samples / step_samples))
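frames_from_samples replaces the former mixdb.mixture_transform_frames and mixdb.mixture_feature_frames calls in get_ft: the frame count is the sample count divided by the step size, rounded up. A worked example with illustrative numbers:

    frames_from_samples(16000, 64)   # -> 250  (16000 / 64 exactly)
    frames_from_samples(16001, 64)   # -> 251  (partial final frame rounds up)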