sonusai 0.18.6__py3-none-any.whl → 0.18.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +6 -0
- sonusai/genmetrics.py +4 -4
- sonusai/metrics/__init__.py +2 -1
- sonusai/metrics/calc_audio_stats.py +9 -1
- sonusai/metrics/calc_segsnr_f.py +84 -0
- sonusai/metrics/calc_speech.py +5 -5
- sonusai/mixture/__init__.py +3 -0
- sonusai/mixture/datatypes.py +65 -6
- sonusai/mixture/feature.py +4 -19
- sonusai/mixture/helpers.py +47 -38
- sonusai/mixture/mixdb.py +133 -40
- sonusai/mixture/sox_audio.py +125 -0
- sonusai/mixture/truth_functions/data.py +23 -22
- sonusai/mixture/truth_functions/energy.py +3 -1
- sonusai/mixture/truth_functions/sed.py +2 -1
- sonusai/mixture/truth_functions/target.py +3 -4
- sonusai/post_spenh_targetf.py +7 -7
- sonusai/utils/__init__.py +2 -0
- sonusai/utils/compress.py +25 -0
- sonusai/utils/energy_f.py +3 -4
- {sonusai-0.18.6.dist-info → sonusai-0.18.7.dist-info}/METADATA +1 -1
- {sonusai-0.18.6.dist-info → sonusai-0.18.7.dist-info}/RECORD +24 -23
- sonusai/metrics/calc_snr_f.py +0 -34
- {sonusai-0.18.6.dist-info → sonusai-0.18.7.dist-info}/WHEEL +0 -0
- {sonusai-0.18.6.dist-info → sonusai-0.18.7.dist-info}/entry_points.txt +0 -0
sonusai/__init__.py
CHANGED
```diff
@@ -2,6 +2,12 @@ import logging
 from importlib import metadata
 from os.path import dirname
 
+from pyaaware import TorchForwardTransform
+from pyaaware import TorchInverseTransform
+
+ForwardTransform = TorchForwardTransform
+InverseTransform = TorchInverseTransform
+
 __version__ = metadata.version(__package__)
 BASEDIR = dirname(__file__)
 
```
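These aliases re-export the Torch-backed pyaaware transforms at the package root so the rest of the code base can write `from sonusai import ForwardTransform` (see the sonusai/mixture/helpers.py hunks below). A minimal sketch of the resulting call pattern; the constructor values and the `ttype` string here are illustrative, not taken from the package:

```python
import torch

from sonusai import ForwardTransform  # alias for pyaaware.TorchForwardTransform

# Parameter names mirror the helpers.py usage below; the values are made up.
fft = ForwardTransform(N=512, R=256, bin_start=0, bin_end=256, ttype='stft')

audio = torch.zeros(10 * 256)    # placeholder time-domain signal
f, e = fft.execute_all(audio)    # frequency data [frames, bins] and energy [frames]
```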
sonusai/genmetrics.py
CHANGED
```diff
@@ -115,11 +115,11 @@ def main() -> None:
     mixdb = MixtureDatabase(location)
     supported = mixdb.supported_metrics
     if show_supported:
-        logger.info(f'\nSupported metrics
+        logger.info(f'\nSupported metrics:\n\n{supported.pretty}')
         sys.exit(0)
 
     if includes is None or 'all' in includes:
-        metrics = supported
+        metrics = supported.names
     else:
         metrics = set(includes)
         if 'mxwer' in metrics:
@@ -127,7 +127,7 @@ def main() -> None:
             for name in mixdb.asr_configs:
                 metrics.add(f'mxwer.{name}')
 
-    diff = metrics.difference(supported)
+    diff = metrics.difference(supported.names)
     if diff:
         logger.error(f'Unrecognized metric: {", ".join(diff)}')
         sys.exit(1)
@@ -141,7 +141,7 @@ def main() -> None:
             for name in mixdb.asr_configs:
                 _excludes.add(f'mxwer.{name}')
 
-    diff = _excludes.difference(supported)
+    diff = _excludes.difference(supported.names)
     if diff:
         logger.error(f'Unrecognized metric: {", ".join(diff)}')
         sys.exit(1)
```
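The common thread in these hunks: `mixdb.supported_metrics` now returns a `MetricDocs` collection (added in sonusai/mixture/datatypes.py below) rather than a bare set of names, so set arithmetic moves to `.names` and display moves to `.pretty`. A small illustrative sketch; the entries are hypothetical, not the package's real metric docs:

```python
from sonusai.mixture import MetricDoc, MetricDocs

supported = MetricDocs([
    MetricDoc('Mixture Metrics', 'mxsnr', 'SNR specification in dB'),   # hypothetical entry
    MetricDoc('Mixture Metrics', 'mxwer.faster', 'Word error rate'),    # hypothetical entry
])

print(supported.pretty)                       # category-grouped, column-aligned listing
unknown = {'mxsnr', 'bogus'} - supported.names
print(unknown)                                # {'bogus'}; .names is a set[str]
```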
sonusai/metrics/__init__.py
CHANGED
```diff
@@ -8,7 +8,8 @@ from .calc_pesq import calc_pesq
 from .calc_phase_distance import calc_phase_distance
 from .calc_sa_sdr import calc_sa_sdr
 from .calc_sample_weights import calc_sample_weights
-from .calc_snr_f import calc_snr_f
+from .calc_segsnr_f import calc_segsnr_f
+from .calc_segsnr_f import calc_segsnr_f_bin
 from .calc_speech import calc_speech
 from .calc_wer import calc_wer
 from .calc_wsdr import calc_wsdr
```
sonusai/metrics/calc_audio_stats.py
CHANGED
```diff
@@ -2,6 +2,14 @@ from sonusai.mixture.datatypes import AudioStatsMetrics
 from sonusai.mixture.datatypes import AudioT
 
 
+def _convert_str_with_factors_to_int(x: str) -> int:
+    if 'k' in x:
+        return int(1000 * float(x.replace('k', '')))
+    if 'M' in x:
+        return int(1000000 * float(x.replace('M', '')))
+    return int(x)
+
+
 def calc_audio_stats(audio: AudioT, win_len: float = None) -> AudioStatsMetrics:
     from sonusai.mixture import SAMPLE_RATE
     from sonusai.mixture import Transformer
@@ -38,5 +46,5 @@ def calc_audio_stats(audio: AudioT, win_len: float = None) -> AudioStatsMetrics:
         tr=float(stats['RMS Tr dB']),
         cr=float(stats['Crest factor']),
         fl=float(stats['Flat factor']),
-        pkc=
+        pkc=_convert_str_with_factors_to_int(stats['Pk count']),
     )
```
sonusai/metrics/calc_segsnr_f.py
ADDED
```diff
@@ -0,0 +1,84 @@
+import numpy as np
+
+from sonusai.mixture.datatypes import AudioF
+from sonusai.mixture.datatypes import Segsnr
+from sonusai.mixture.datatypes import SnrFBinMetrics
+from sonusai.mixture.datatypes import SnrFMetrics
+
+
+def calc_segsnr_f(segsnr_f: Segsnr) -> SnrFMetrics:
+    """Calculate metrics of snr_f truth data.
+
+    Includes mean and standard deviation of the linear values (usually energy)
+    and mean and standard deviation of the dB values (10 * log10).
+    """
+    if np.count_nonzero(segsnr_f) == 0:
+        # If all entries are zeros
+        return SnrFMetrics(0, 0, -np.inf, 0)
+
+    tmp = np.ma.array(segsnr_f, mask=np.logical_not(np.isfinite(segsnr_f)))
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are infinite
+        return SnrFMetrics(np.inf, 0, np.inf, 0)
+
+    snr_mean = np.mean(tmp, axis=0)
+    snr_std = np.std(tmp, axis=0)
+
+    tmp = 10 * np.ma.log10(tmp)
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are masked, special case where all inputs are either 0 or infinite
+        snr_db_mean = -np.inf
+        snr_db_std = np.inf
+    else:
+        snr_db_mean = np.mean(tmp, axis=0)
+        snr_db_std = np.std(tmp, axis=0)
+
+    return SnrFMetrics(snr_mean,
+                       snr_std,
+                       snr_db_mean,
+                       snr_db_std)
+
+
+def calc_segsnr_f_bin(target_f: AudioF, noise_f: AudioF) -> SnrFBinMetrics:
+    """Calculate per-bin segmental SNR metrics.
+
+    Includes per-bin mean and standard deviation of the linear values
+    and mean and standard deviation of the dB values.
+    """
+    if target_f.ndim != 2 and noise_f.ndim != 2:
+        raise ValueError('target_f and noise_f must have 2 dimensions')
+
+    segsnr_f = (np.abs(target_f) ** 2) / (np.abs(noise_f) ** 2)
+
+    frames, bins = segsnr_f.shape
+    if np.count_nonzero(segsnr_f) == 0:
+        # If all entries are zeros
+        return SnrFBinMetrics(np.zeros(bins),
+                              np.zeros(bins),
+                              -np.inf * np.ones(bins),
+                              np.zeros(bins))
+
+    tmp = np.ma.array(segsnr_f, mask=np.logical_not(np.isfinite(segsnr_f)))
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are infinite
+        return SnrFBinMetrics(np.inf * np.ones(bins),
+                              np.zeros(bins),
+                              np.inf * np.ones(bins),
+                              np.zeros(bins))
+
+    snr_mean = np.mean(tmp, axis=0)
+    snr_std = np.std(tmp, axis=0)
+
+    tmp = 10 * np.ma.log10(tmp)
+    if np.ma.count_masked(tmp) == np.ma.size(tmp, axis=0):
+        # If all entries are masked, special case where all inputs are either 0 or infinite
+        snr_db_mean = -np.inf * np.ones(bins)
+        snr_db_std = np.inf * np.ones(bins)
+    else:
+        snr_db_mean = np.mean(tmp, axis=0)
+        snr_db_std = np.std(tmp, axis=0)
+
+    return SnrFBinMetrics(np.ma.getdata(snr_mean),
+                          np.ma.getdata(snr_std),
+                          np.ma.getdata(snr_db_mean),
+                          np.ma.getdata(snr_db_std))
```
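Both functions mask non-finite entries before averaging and compute statistics separately in the linear and dB domains. A hedged usage sketch on synthetic data; the arrays and shapes are invented for illustration:

```python
import numpy as np

from sonusai.metrics import calc_segsnr_f, calc_segsnr_f_bin

rng = np.random.default_rng(0)

# Per-frame segmental SNR, e.g. from the database's segsnr truth data
segsnr = rng.uniform(0.1, 10.0, 400).astype(np.float32)
m = calc_segsnr_f(segsnr)
print(m.avg, m.std, m.db_avg, m.db_std)

# The per-bin variant takes complex spectrograms shaped [frames, bins]
target_f = rng.normal(size=(100, 64)) + 1j * rng.normal(size=(100, 64))
noise_f = rng.normal(size=(100, 64)) + 1j * rng.normal(size=(100, 64))
mb = calc_segsnr_f_bin(target_f, noise_f)
print(mb.db_avg.shape)  # (64,)
```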
sonusai/metrics/calc_speech.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from .calc_pesq import calc_pesq
 
 
 def calc_speech(hypothesis: np.ndarray, reference: np.ndarray, sample_rate: int = SAMPLE_RATE) -> SpeechMetrics:
-    """Calculate speech metrics pesq, c_sig, c_bak, c_ovl
+    """Calculate speech metrics pesq, c_sig, c_bak, and c_ovl.
 
     These are all related and thus included in one function. Reference: matlab script "compute_metrics.m".
 
@@ -38,11 +38,11 @@ def calc_speech(hypothesis: np.ndarray, reference: np.ndarray, sample_rate: int
     _pesq = calc_pesq(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
 
     # Now compute the composite measures
-
-
-
+    csig = np.clip(3.093 - 1.029 * llr_mean + 0.603 * _pesq - 0.009 * wss_dist, 1, 5)
+    cbak = np.clip(1.634 + 0.478 * _pesq - 0.007 * wss_dist + 0.063 * seg_snr, 1, 5)
+    covl = np.clip(1.594 + 0.805 * _pesq - 0.512 * llr_mean - 0.007 * wss_dist, 1, 5)
 
-    return SpeechMetrics(_pesq,
+    return SpeechMetrics(_pesq, csig, cbak, covl)
 
 
 def _calc_weighted_spectral_slope_measure(hypothesis: np.ndarray,
```
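For context: these are the standard composite measures from Hu and Loizou's speech-enhancement evaluation work, each a fixed linear combination of PESQ, LLR, WSS, and segmental SNR, now explicitly clipped to the 1-5 MOS scale. As a worked example with hypothetical inputs _pesq = 2.5, llr_mean = 0.5, and wss_dist = 40: csig = 3.093 - 1.029 * 0.5 + 0.603 * 2.5 - 0.009 * 40 ≈ 3.73, comfortably inside the clip range.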
sonusai/mixture/__init__.py
CHANGED
```diff
@@ -66,6 +66,8 @@ from .datatypes import GeneralizedIDs
 from .datatypes import ImpulseResponseData
 from .datatypes import ImpulseResponseFiles
 from .datatypes import ListAudiosT
+from .datatypes import MetricDoc
+from .datatypes import MetricDocs
 from .datatypes import Mixture
 from .datatypes import MixtureDatabaseConfig
 from .datatypes import Mixtures
@@ -105,6 +107,7 @@ from .helpers import augmented_noise_samples
 from .helpers import augmented_target_samples
 from .helpers import check_audio_files_exist
 from .helpers import forward_transform
+from .helpers import frames_from_samples
 from .helpers import get_audio_from_transform
 from .helpers import get_ft
 from .helpers import get_segsnr
```
sonusai/mixture/datatypes.py
CHANGED
```diff
@@ -1,7 +1,9 @@
 from dataclasses import dataclass
 from typing import Any
+from typing import Iterable
 from typing import NamedTuple
 from typing import Optional
+from typing import SupportsIndex
 from typing import TypeAlias
 
 import numpy as np
@@ -336,17 +338,24 @@ SpeechMetadata: TypeAlias = str | list[Interval] | None
 
 
 class SnrFMetrics(NamedTuple):
-
-
-
+    avg: Optional[float] = None
+    std: Optional[float] = None
+    db_avg: Optional[float] = None
     db_std: Optional[float] = None
 
 
+class SnrFBinMetrics(NamedTuple):
+    avg: Optional[np.ndarray] = None
+    std: Optional[np.ndarray] = None
+    db_avg: Optional[np.ndarray] = None
+    db_std: Optional[np.ndarray] = None
+
+
 class SpeechMetrics(NamedTuple):
     pesq: Optional[float] = None
-
-
-
+    csig: Optional[float] = None
+    cbak: Optional[float] = None
+    covl: Optional[float] = None
 
 
 class AudioStatsMetrics(NamedTuple):
@@ -360,3 +369,53 @@ class AudioStatsMetrics(NamedTuple):
     cr: Optional[float] = None
     fl: Optional[float] = None
     pkc: Optional[float] = None
+
+
+@dataclass
+class MetricDoc:
+    category: str
+    name: str
+    description: str
+
+
+class MetricDocs(list[MetricDoc]):
+    def __init__(self, __iterable: Iterable[MetricDoc]) -> None:
+        super().__init__(item for item in __iterable)
+
+    def __setitem__(self, __key: SupportsIndex, __value: MetricDoc) -> None:  # type: ignore
+        super().__setitem__(__key, __value)
+
+    def insert(self, __index: SupportsIndex, __object: MetricDoc) -> None:
+        super().insert(__index, __object)
+
+    def append(self, __object: MetricDoc) -> None:
+        super().append(__object)
+
+    def extend(self, __iterable: Iterable[MetricDoc]) -> None:
+        if isinstance(__iterable, type(self)):
+            super().extend(__iterable)
+        else:
+            super().extend(item for item in __iterable)
+
+    @property
+    def pretty(self) -> str:
+        max_category_len = ((max([len(item.category) for item in self]) + 9) // 10) * 10
+        max_name_len = 2 + ((max([len(item.name) for item in self]) + 1) // 2) * 2
+        categories: list[str] = []
+        for item in self:
+            if item.category not in categories:
+                categories.append(item.category)
+
+        result = ''
+        for category in categories:
+            result += f'{category}\n'
+            result += '-' * max_category_len + '\n'
+            for item in [sub for sub in self if sub.category == category]:
+                result += f'  {item.name:<{max_name_len}}{item.description}\n'
+            result += '\n'
+
+        return result
+
+    @property
+    def names(self) -> set[str]:
+        return set(item.name for item in self)
```
sonusai/mixture/feature.py
CHANGED
```diff
@@ -1,6 +1,5 @@
 from typing import Optional
 
-from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import Feature
 
@@ -58,15 +57,13 @@ def get_feature_from_audio(audio: AudioT,
 def get_audio_from_feature(feature: Feature,
                            feature_mode: str,
                            num_classes: Optional[int] = 1,
-                           truth_mutex: Optional[bool] = False,
-                           trim: Optional[bool] = True) -> AudioT:
+                           truth_mutex: Optional[bool] = False) -> AudioT:
     """Apply inverse transform to feature data to generate audio data
 
     :param feature: Feature data [frames, strides, feature_parameters]
     :param feature_mode: Feature mode
    :param num_classes: Number of classes
     :param truth_mutex: Whether to calculate 'other' label
-    :param trim: Whether to trim the audio data
     :return: Audio data [samples]
     """
     import numpy as np
@@ -76,6 +73,7 @@ def get_audio_from_feature(feature: Feature,
     from .datatypes import TransformConfig
     from .helpers import inverse_transform
     from sonusai.utils.stacked_complex import unstack_complex
+    from sonusai.utils.compress import power_uncompress
 
     fg = FeatureGenerator(feature_mode=feature_mode,
                           num_classes=num_classes,
@@ -83,23 +81,10 @@ def get_audio_from_feature(feature: Feature,
 
     feature_complex = unstack_complex(feature)
     if feature_mode[0:1] == 'h':
-        feature_complex = _power_uncompress(feature_complex)
+        feature_complex = power_uncompress(feature_complex)
     return np.squeeze(inverse_transform(transform=feature_complex,
                                         config=TransformConfig(N=fg.itransform_N,
                                                                R=fg.itransform_R,
                                                                bin_start=fg.bin_start,
                                                                bin_end=fg.bin_end,
-                                                               ttype=fg.itransform_ttype),
-                                        trim=trim))
-
-
-def _power_uncompress(feature: AudioF) -> AudioF:
-    import numpy as np
-
-    mag = np.abs(feature)
-    phase = np.angle(feature)
-    mag = mag ** (1. / 0.3)
-    real_uncompress = mag * np.cos(phase)
-    imag_uncompress = mag * np.sin(phase)
-
-    return real_uncompress + 1j * imag_uncompress
+                                                               ttype=fg.itransform_ttype)))
```
sonusai/mixture/helpers.py
CHANGED
```diff
@@ -2,9 +2,9 @@ from typing import Any
 from typing import Optional
 
 from praatio.utilities.constants import Interval
-from pyaaware import ForwardTransform
-from pyaaware import InverseTransform
 
+from sonusai import ForwardTransform
+from sonusai import InverseTransform
 from sonusai.mixture import EnergyT
 from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
@@ -285,7 +285,10 @@ def read_mixture_data(name: str, items: list[str] | str) -> Any:
 
     def _get_dataset(file: h5py.File, d_name: str) -> Any:
         if d_name in file:
-
+            data = np.array(file[d_name])
+            if data.size == 1:
+                return data.item()
+            return data
         return None
 
     if not isinstance(items, list):
@@ -371,8 +374,8 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
 
     mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)
 
-    transform_frames =
-    feature_frames =
+    transform_frames = frames_from_samples(mixture.samples, mixdb.ft_config.R)
+    feature_frames = frames_from_samples(mixture.samples, mixdb.feature_step_samples)
 
     feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.feature_parameters), dtype=np.float32)
     truth_f = np.empty((feature_frames, mixdb.num_classes), dtype=np.complex64)
@@ -418,20 +421,21 @@ def get_segsnr_t(mixdb: MixtureDatabase, mixture: Mixture, target_audio: AudioT,
     :return: segsnr_t data
     """
     import numpy as np
-
+    import torch
+    from sonusai import ForwardTransform
 
     from sonusai import SonusAIError
 
-    fft =
-
-
-
+    fft = ForwardTransform(N=mixdb.ft_config.N,
+                           R=mixdb.ft_config.R,
+                           bin_start=mixdb.ft_config.bin_start,
+                           bin_end=mixdb.ft_config.bin_end,
+                           ttype=mixdb.ft_config.ttype)
 
     segsnr_t = np.empty(mixture.samples, dtype=np.float32)
 
-
-
+    target_energy = fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
+    noise_energy = fft.execute_all(torch.from_numpy(noise_audio))[1].numpy()
 
     offsets = range(0, mixture.samples, mixdb.ft_config.R)
     if len(target_energy) != len(offsets):
@@ -505,8 +509,11 @@ def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tupl
     :param transform: ForwardTransform object
     :return: Frequency domain data [frames, bins], Energy [frames]
     """
-
-
+    import torch
+
+    f, e = transform.execute_all(torch.from_numpy(audio))
+
+    return f.numpy(), e.numpy()
 
 
 def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
@@ -518,54 +525,50 @@ def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
     :param config: Transform configuration
     :return: Frequency domain data [frames, bins]
     """
-    from
+    from sonusai import ForwardTransform
 
     audio_f, _ = get_transform_from_audio(audio=audio,
-                                          transform=
-
-
-
-
+                                          transform=ForwardTransform(N=config.N,
+                                                                     R=config.R,
+                                                                     bin_start=config.bin_start,
+                                                                     bin_end=config.bin_end,
+                                                                     ttype=config.ttype))
     return audio_f
 
 
-def get_audio_from_transform(data: AudioF, transform: InverseTransform
+def get_audio_from_transform(data: AudioF, transform: InverseTransform) -> tuple[AudioT, EnergyT]:
     """Apply inverse transform to input transform data to generate audio data
 
     :param data: Frequency domain data [frames, bins]
     :param transform: InverseTransform object
-    :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the transform
     :return: Time domain data [samples], Energy [frames]
     """
-
-    if trim:
-        t = t[transform.N - transform.R:]
+    import torch
 
-
+    t, e = transform.execute_all(torch.from_numpy(data))
 
+    return t.numpy(), e.numpy()
 
-
+
+def inverse_transform(transform: AudioF, config: TransformConfig) -> AudioT:
     """Transform frequency domain data into time domain using the inverse transform config from the feature
 
     A new transform is used for each call; i.e., state is not maintained between calls to inverse_transform().
 
     :param transform: Frequency domain data [frames, bins]
     :param config: Transform configuration
-    :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the
-        transform
     :return: Time domain data [samples]
     """
     import numpy as np
-    from
+    from sonusai import InverseTransform
 
     audio, _ = get_audio_from_transform(data=transform,
-                                        transform=
-
-
-
-
-
-                                        trim=trim)
+                                        transform=InverseTransform(N=config.N,
+                                                                   R=config.R,
+                                                                   bin_start=config.bin_start,
+                                                                   bin_end=config.bin_end,
+                                                                   ttype=config.ttype,
+                                                                   gain=np.float32(1)))
     return audio
 
 
@@ -641,3 +644,9 @@ def get_textgrid_tier_from_target_file(target_file: str, tier: str) -> Optional[
         return list(entries)
     else:
         return entries[0].label
+
+
+def frames_from_samples(samples: int, step_samples: int) -> int:
+    import numpy as np
+
+    return int(np.ceil(samples / step_samples))
```
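Two patterns run through the helpers changes. First, audio now crosses the numpy/torch boundary explicitly: numpy arrays are wrapped with torch.from_numpy() for the Torch-backed pyaaware transforms and converted back with .numpy(). Second, frame counts are centralized in frames_from_samples(), which simply rounds up. An illustrative check; the sample counts are made up:

```python
from sonusai.mixture import frames_from_samples

# A 16000-sample mixture with a 256-sample transform hop: 16000 / 256 = 62.5,
# which rounds up to 63 frames; an exact multiple needs no rounding.
assert frames_from_samples(16000, 256) == 63
assert frames_from_samples(16384, 256) == 64
```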