torchaudio-2.7.1-cp310-cp310-win_amd64.whl → torchaudio-2.8.0-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- torchaudio/__init__.py +16 -5
- torchaudio/_backend/sox.py +2 -2
- torchaudio/_backend/utils.py +33 -0
- torchaudio/_internal/module_utils.py +59 -10
- torchaudio/_torchcodec.py +352 -0
- torchaudio/backend/no_backend.py +2 -2
- torchaudio/backend/soundfile_backend.py +2 -2
- torchaudio/backend/sox_io_backend.py +2 -2
- torchaudio/functional/__init__.py +6 -1
- torchaudio/functional/functional.py +7 -3
- torchaudio/io/__init__.py +10 -3
- torchaudio/kaldi_io.py +6 -0
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +7 -1
- torchaudio/pipelines/_tts/utils.py +3 -1
- torchaudio/prototype/datasets/musan.py +2 -1
- torchaudio/prototype/functional/_dsp.py +8 -0
- torchaudio/prototype/functional/_rir.py +3 -0
- torchaudio/prototype/functional/functional.py +3 -0
- torchaudio/prototype/models/__init__.py +4 -1
- torchaudio/prototype/models/_conformer_wav2vec2.py +7 -0
- torchaudio/prototype/models/_emformer_hubert.py +4 -0
- torchaudio/prototype/models/conv_emformer.py +4 -0
- torchaudio/prototype/models/hifi_gan.py +6 -0
- torchaudio/prototype/models/rnnt.py +6 -0
- torchaudio/prototype/models/rnnt_decoder.py +3 -0
- torchaudio/prototype/pipelines/__init__.py +11 -2
- torchaudio/prototype/pipelines/_vggish/__init__.py +5 -1
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +4 -1
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +3 -2
- torchaudio/prototype/pipelines/hifigan_pipeline.py +5 -0
- torchaudio/prototype/transforms/_transforms.py +6 -1
- torchaudio/sox_effects/sox_effects.py +4 -1
- torchaudio/transforms/__init__.py +3 -1
- torchaudio/transforms/_transforms.py +3 -2
- torchaudio/utils/download.py +2 -0
- torchaudio/utils/sox_utils.py +19 -0
- torchaudio/version.py +2 -2
- {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/METADATA +13 -2
- {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/RECORD +52 -51
- torio/io/_streaming_media_decoder.py +0 -1
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/ffmpeg_utils.py +28 -0
- {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/WHEEL +0 -0
- {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/licenses/LICENSE +0 -0
- {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/top_level.txt +0 -0
torchaudio/io/__init__.py
CHANGED
@@ -1,7 +1,14 @@
-from torio.io import CodecConfig, StreamingMediaDecoder as StreamReader, StreamingMediaEncoder as StreamWriter
+from torio.io import CodecConfig as _CodecConfig, StreamingMediaDecoder as _StreamReader, StreamingMediaEncoder as _StreamWriter
+from torchaudio._internal.module_utils import dropping_class_io_support, dropping_class_support, dropping_io_support

-from ._effector import AudioEffector
-from ._playback import play_audio
+from ._effector import AudioEffector as _AudioEffector
+from ._playback import play_audio as _play_audio
+
+CodecConfig = dropping_class_io_support(_CodecConfig)
+StreamReader = dropping_class_io_support(_StreamReader)
+StreamWriter = dropping_class_io_support(_StreamWriter)
+AudioEffector = dropping_class_support(_AudioEffector)
+play_audio = dropping_io_support(_play_audio)


 __all__ = [
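Every public entry point of `torchaudio.io` is now re-exported through a deprecation wrapper. The wrappers themselves are defined in `torchaudio/_internal/module_utils.py` (+59 -10 in the file list above), whose body is not included in this diff. As a rough sketch only — the names match the diff, but the decorator bodies and warning messages below are assumptions, not the shipped code — they can be pictured like this:

# Hypothetical sketch of the wrappers; the real definitions live in
# torchaudio/_internal/module_utils.py, which this diff does not show.
import functools
import warnings


def dropping_support(func):
    """Emit a deprecation warning each time the wrapped function is called."""

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        warnings.warn(
            f"{func.__name__} is deprecated and will be removed from torchaudio "
            "in a future release.",
            UserWarning,
            stacklevel=2,
        )
        return func(*args, **kwargs)

    return wrapped


def dropping_class_support(cls):
    """Emit the same warning when the wrapped class is instantiated."""
    orig_init = cls.__init__

    @functools.wraps(orig_init)
    def new_init(self, *args, **kwargs):
        warnings.warn(
            f"{cls.__name__} is deprecated and will be removed from torchaudio "
            "in a future release.",
            UserWarning,
            stacklevel=2,
        )
        orig_init(self, *args, **kwargs)

    cls.__init__ = new_init
    return cls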
torchaudio/kaldi_io.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Callable, Iterable, Tuple
 import torch
 from torch import Tensor
 from torchaudio._internal import module_utils as _mod_utils
+from torchaudio._internal.module_utils import dropping_support

 if _mod_utils.is_module_available("numpy"):
     import numpy as np
@@ -41,6 +42,7 @@ def _convert_method_output_to_tensor(
         yield key, torch.from_numpy(np_arr)


+@dropping_support
 @_mod_utils.requires_module("kaldi_io", "numpy")
 def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     r"""Create generator of (key,vector<int>) tuples, which reads from the ark file/stream.
@@ -64,6 +66,7 @@ def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_int_ark, convert_contiguous=True)


+@dropping_support
 @_mod_utils.requires_module("kaldi_io", "numpy")
 def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     r"""Create generator of (key,vector<float32/float64>) tuples, read according to Kaldi scp.
@@ -84,6 +87,7 @@ def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_flt_scp)


+@dropping_support
 @_mod_utils.requires_module("kaldi_io", "numpy")
 def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     r"""Create generator of (key,vector<float32/float64>) tuples, which reads from the ark file/stream.
@@ -104,6 +108,7 @@ def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_flt_ark)


+@dropping_support
 @_mod_utils.requires_module("kaldi_io", "numpy")
 def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     r"""Create generator of (key,matrix<float32/float64>) tuples, read according to Kaldi scp.
@@ -124,6 +129,7 @@ def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_mat_scp)


+@dropping_support
 @_mod_utils.requires_module("kaldi_io", "numpy")
 def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
     r"""Create generator of (key,matrix<float32/float64>) tuples, which reads from the ark file/stream.
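Each `kaldi_io` reader keeps its behavior but now warns on use. Callers migrating gradually can silence the warning explicitly; a usage sketch (assuming the decorator emits an ordinary `UserWarning`, which the diff itself does not show):

import warnings

import torchaudio.kaldi_io as kaldi_io

# Temporarily silence the deprecation warning while migrating off kaldi_io.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    for key, mat in kaldi_io.read_mat_scp("feats.scp"):
        print(key, mat.shape)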
torchaudio/lib/_torchaudio.pyd
CHANGED
Binary file
torchaudio/lib/libtorchaudio.pyd
CHANGED
Binary file
torchaudio/models/decoder/__init__.py
CHANGED

@@ -1,3 +1,5 @@
+from torchaudio._internal.module_utils import dropping_support, dropping_class_support
+import inspect
 _CTC_DECODERS = [
     "CTCHypothesis",
     "CTCDecoder",
@@ -33,7 +35,11 @@ def __getattr__(name: str):
                 "To use CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
             ) from err

-        item = getattr(_cuda_ctc_decoder, name)
+        orig_item = getattr(_cuda_ctc_decoder, name)
+        if inspect.isclass(orig_item):
+            item = dropping_class_support(orig_item)
+        else:
+            item = dropping_support(orig_item)
         globals()[name] = item
         return item
     raise AttributeError(f"module {__name__} has no attribute {name}")
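The surrounding `__getattr__` is the module-level lazy-import hook from PEP 562: the CUDA CTC decoder symbols are resolved on first attribute access, wrapped once (classes via `dropping_class_support`, callables via `dropping_support`), and cached in `globals()` so later accesses skip the hook entirely. A minimal standalone sketch of the pattern, with illustrative names only:

# illustrative_pkg/__init__.py — module-level __getattr__ (PEP 562)
import importlib

_LAZY = {"CTCDecoder": "._impl"}  # public name -> defining submodule


def __getattr__(name: str):
    if name in _LAZY:
        module = importlib.import_module(_LAZY[name], __name__)
        item = getattr(module, name)
        globals()[name] = item  # cache: __getattr__ only fires on lookup misses
        return item
    raise AttributeError(f"module {__name__} has no attribute {name}")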
torchaudio/pipelines/_tts/utils.py
CHANGED

@@ -161,6 +161,7 @@ def _load_phonemizer(file, dl_kwargs):
         raise RuntimeError("DeepPhonemizer is not installed. Please install it.")

     from dp.phonemizer import Phonemizer
+    from dp.preprocessing.text import Preprocessor, LanguageTokenizer, SequenceTokenizer

     # By default, dp issues DEBUG level log.
     logger = logging.getLogger("dp")
@@ -174,7 +175,8 @@ def _load_phonemizer(file, dl_kwargs):
         if not os.path.exists(path):
             dl_kwargs = {} if dl_kwargs is None else dl_kwargs
             download_url_to_file(url, path, **dl_kwargs)
-        return Phonemizer.from_checkpoint(path)
+        with torch.serialization.safe_globals([Preprocessor, LanguageTokenizer, SequenceTokenizer]):
+            return Phonemizer.from_checkpoint(path)
     finally:
         logger.setLevel(orig_level)
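This change tracks PyTorch's `weights_only` loading: recent PyTorch releases default `torch.load` to `weights_only=True`, which rejects checkpoints that pickle arbitrary classes unless those classes are allowlisted, and `torch.serialization.safe_globals` does exactly that for the DeepPhonemizer types stored in the checkpoint. The same mechanism in isolation (the `MyConfig` class is a made-up stand-in):

from dataclasses import dataclass

import torch


@dataclass
class MyConfig:  # stand-in for classes embedded in a real checkpoint
    lr: float = 0.1


torch.save({"config": MyConfig()}, "ckpt.pt")

# Without the allowlist, weights_only loading rejects non-tensor classes.
with torch.serialization.safe_globals([MyConfig]):
    state = torch.load("ckpt.pt", weights_only=True)
print(state["config"].lr)  # 0.1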
torchaudio/prototype/datasets/musan.py
CHANGED

@@ -4,12 +4,13 @@ from typing import Tuple, Union
 import torch
 from torch.utils.data import Dataset
 from torchaudio.datasets.utils import _load_waveform
+from torchaudio._internal.module_utils import dropping_support, dropping_class_support


 _SUBSETS = ["music", "noise", "speech"]
 _SAMPLE_RATE = 16_000

-
+@dropping_class_support
 class Musan(Dataset):
     r"""*MUSAN* :cite:`musan2015` dataset.

torchaudio/prototype/functional/_dsp.py
CHANGED

@@ -4,8 +4,10 @@ from typing import List, Optional, Union
 import torch

 from torchaudio.functional import fftconvolve
+from torchaudio._internal.module_utils import dropping_support


+@dropping_support
 def oscillator_bank(
     frequencies: torch.Tensor,
     amplitudes: torch.Tensor,
@@ -81,6 +83,7 @@ def oscillator_bank(
     return waveform


+@dropping_support
 def adsr_envelope(
     num_frames: int,
     *,
@@ -182,6 +185,7 @@ def adsr_envelope(
     return out


+@dropping_support
 def extend_pitch(
     base: torch.Tensor,
     pattern: Union[int, List[float], torch.Tensor],
@@ -249,6 +253,7 @@ def extend_pitch(
     return h_freq


+@dropping_support
 def sinc_impulse_response(cutoff: torch.Tensor, window_size: int = 513, high_pass: bool = False):
     """Create windowed-sinc impulse response for given cutoff frequencies.

@@ -288,6 +293,7 @@ def sinc_impulse_response(cutoff: torch.Tensor, window_size: int = 513, high_pass: bool = False):
     return filt


+@dropping_support
 def frequency_impulse_response(magnitudes):
     """Create filter from desired frequency response

@@ -319,6 +325,7 @@ def _overlap_and_add(waveform, stride):
     return buffer


+@dropping_support
 def filter_waveform(waveform: torch.Tensor, kernels: torch.Tensor, delay_compensation: int = -1):
     """Applies filters along time axis of the given waveform.

@@ -404,6 +411,7 @@ def filter_waveform(waveform: torch.Tensor, kernels: torch.Tensor, delay_compensation: int = -1):
     return result


+@dropping_support
 def exp_sigmoid(
     input: torch.Tensor, exponent: float = 10.0, max_value: float = 2.0, threshold: float = 1e-7
 ) -> torch.Tensor:
torchaudio/prototype/functional/_rir.py
CHANGED

@@ -1,5 +1,6 @@
 import math
 from typing import Optional, Tuple, Union
+from torchaudio._internal.module_utils import dropping_support

 import torch
 import torchaudio
@@ -176,6 +177,7 @@ def _validate_inputs(
         raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")


+@dropping_support
 def simulate_rir_ism(
     room: torch.Tensor,
     source: torch.Tensor,
@@ -276,6 +278,7 @@ def simulate_rir_ism(
     return rir


+@dropping_support
 def ray_tracing(
     room: torch.Tensor,
     source: torch.Tensor,
torchaudio/prototype/functional/functional.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Optional

 import torch
 from torchaudio.functional.functional import _create_triangular_filterbank
+from torchaudio._internal.module_utils import dropping_support


 def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
@@ -72,6 +73,7 @@ def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
     return torch.log2(freqs / (a440 / 16))


+@dropping_support
 def barkscale_fbanks(
     n_freqs: int,
     f_min: float,
@@ -129,6 +131,7 @@ def barkscale_fbanks(
     return fb


+@dropping_support
 def chroma_filterbank(
     sample_rate: int,
     n_freqs: int,
torchaudio/prototype/models/__init__.py
CHANGED

@@ -1,3 +1,4 @@
+from torchaudio._internal.module_utils import dropping_const_support
 from ._conformer_wav2vec2 import (
     conformer_wav2vec2_base,
     conformer_wav2vec2_model,
@@ -10,7 +11,9 @@ from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
 from .conv_emformer import ConvEmformer
 from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
 from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
-from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
+from .rnnt_decoder import Hypothesis as _Hypothesis, RNNTBeamSearchBiasing
+
+Hypothesis = dropping_const_support(_Hypothesis, name="Hypothesis")

 __all__ = [
     "conformer_rnnt_base",
torchaudio/prototype/models/_conformer_wav2vec2.py
CHANGED

@@ -7,6 +7,7 @@ from torchaudio.models import Wav2Vec2Model
 from torchaudio.models.conformer import ConformerLayer
 from torchaudio.models.rnnt import _TimeReduction
 from torchaudio.models.wav2vec2 import components
+from torchaudio._internal.module_utils import dropping_class_support, dropping_support


 def _buffered_arange(max) -> Tensor:
@@ -252,6 +253,7 @@ class ConformerEncoder(Module):
         return self._get_intermediate_outputs(x, mask=masks, num_layers=num_layers)


+@dropping_class_support
 class ConformerWav2Vec2PretrainModel(Module):
     """Conformer Wav2Vec2 pre-train model for training from scratch.

@@ -437,6 +439,7 @@ def _get_conformer_negativer_sampler(
     return NegativeSampler(preprocessor, num_negatives, cross_sample_negatives)


+@dropping_support
 def conformer_wav2vec2_model(
     extractor_input_dim: int,
     extractor_output_dim: int,
@@ -501,6 +504,7 @@ def conformer_wav2vec2_model(
     return Wav2Vec2Model(feature_extractor, encoder)


+@dropping_support
 def conformer_wav2vec2_base(
     extractor_input_dim: int = 64,
     extractor_output_dim: int = 256,
@@ -536,6 +540,7 @@ def conformer_wav2vec2_base(
     )


+@dropping_support
 def conformer_wav2vec2_pretrain_model(
     extractor_input_dim: int,
     extractor_output_dim: int,
@@ -672,6 +677,7 @@ def conformer_wav2vec2_pretrain_model(
     )


+@dropping_support
 def conformer_wav2vec2_pretrain_base(
     extractor_input_dim: int = 64,
     extractor_output_dim: int = 256,
@@ -733,6 +739,7 @@ def conformer_wav2vec2_pretrain_base(
     )


+@dropping_support
 def conformer_wav2vec2_pretrain_large(
     extractor_input_dim: int = 64,
     extractor_output_dim: int = 256,
torchaudio/prototype/models/_emformer_hubert.py
CHANGED

@@ -4,6 +4,8 @@ import torch
 from torchaudio.models import Wav2Vec2Model
 from torchaudio.models.emformer import Emformer
 from torchaudio.models.rnnt import _TimeReduction
+from torchaudio._internal.module_utils import dropping_support
+


 class FeatureEncoder(torch.nn.Module):
@@ -217,6 +219,7 @@ def _get_emformer_encoder(
     return EmformerEncoder(emformer, output_linear, layer_norm)


+@dropping_support
 def emformer_hubert_model(
     extractor_input_dim: int,
     extractor_output_dim: int,
@@ -292,6 +295,7 @@ def emformer_hubert_model(
     return Wav2Vec2Model(feature_extractor, emformer, aux)


+@dropping_support
 def emformer_hubert_base(
     extractor_input_dim: int = 80,
     extractor_output_dim: int = 128,
torchaudio/prototype/models/conv_emformer.py
CHANGED

@@ -3,6 +3,8 @@ from typing import List, Optional, Tuple

 import torch
 from torchaudio.models.emformer import _EmformerAttention, _EmformerImpl, _get_weight_init_gains
+from torchaudio._internal.module_utils import dropping_class_support, dropping_support
+


 def _get_activation_module(activation: str) -> torch.nn.Module:
@@ -441,6 +443,7 @@ class _ConvEmformerLayer(torch.nn.Module):
         return output_utterance, output_right_context, output_state, next_m


+@dropping_class_support
 class ConvEmformer(_EmformerImpl):
     r"""Implements the convolution-augmented streaming transformer architecture introduced in
     *Streaming Transformer Transducer based Speech Recognition Using Non-Causal Convolution*
@@ -476,6 +479,7 @@ class ConvEmformer(_EmformerImpl):
         >>> output, lengths, states = conv_emformer.infer(input, lengths, None)
     """

+    @dropping_support
     def __init__(
         self,
         input_dim: int,
torchaudio/prototype/models/hifi_gan.py
CHANGED

@@ -28,8 +28,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import Conv1d, ConvTranspose1d
+from torchaudio._internal.module_utils import dropping_class_support, dropping_support


+@dropping_class_support
 class HiFiGANVocoder(torch.nn.Module):
     """Generator part of *HiFi GAN* :cite:`NEURIPS2020_c5d73680`.
     Source: https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/models.py#L75
@@ -246,6 +248,7 @@ def get_padding(kernel_size, dilation=1):
     return int((kernel_size * dilation - dilation) / 2)


+@dropping_support
 def hifigan_vocoder(
     in_channels: int,
     upsample_rates: Tuple[int, ...],
@@ -282,6 +285,7 @@ def hifigan_vocoder(
     )


+@dropping_support
 def hifigan_vocoder_v1() -> HiFiGANVocoder:
     r"""Builds HiFiGAN Vocoder with V1 architecture :cite:`NEURIPS2020_c5d73680`.

@@ -300,6 +304,7 @@ def hifigan_vocoder_v1() -> HiFiGANVocoder:
     )


+@dropping_support
 def hifigan_vocoder_v2() -> HiFiGANVocoder:
     r"""Builds HiFiGAN Vocoder with V2 architecture :cite:`NEURIPS2020_c5d73680`.

@@ -318,6 +323,7 @@ def hifigan_vocoder_v2() -> HiFiGANVocoder:
     )


+@dropping_support
 def hifigan_vocoder_v3() -> HiFiGANVocoder:
     r"""Builds HiFiGAN Vocoder with V3 architecture :cite:`NEURIPS2020_c5d73680`.

torchaudio/prototype/models/rnnt.py
CHANGED

@@ -5,6 +5,8 @@ import torch
 from torchaudio.models import Conformer, RNNT
 from torchaudio.models.rnnt import _Joiner, _Predictor, _TimeReduction, _Transcriber

+from torchaudio._internal.module_utils import dropping_support
+

 TrieNode = Tuple[Dict[int, "TrieNode"], int, Optional[Tuple[int, int]]]

@@ -472,6 +474,7 @@ class RNNTBiasing(RNNT):
         return output, source_lengths, jointer_activation


+@dropping_support
 def conformer_rnnt_model(
     *,
     input_dim: int,
@@ -544,6 +547,7 @@ def conformer_rnnt_model(
     return RNNT(encoder, predictor, joiner)


+@dropping_support
 def conformer_rnnt_base() -> RNNT:
     r"""Builds basic version of Conformer RNN-T model.

@@ -572,6 +576,7 @@ def conformer_rnnt_base() -> RNNT:
     )


+@dropping_support
 def conformer_rnnt_biasing(
     *,
     input_dim: int,
@@ -677,6 +682,7 @@ def conformer_rnnt_biasing(
     )


+@dropping_support
 def conformer_rnnt_biasing_base(charlist=None, biasing=True) -> RNNT:
     r"""Builds basic version of Conformer RNN-T model with TCPGen.

|
@@ -4,6 +4,8 @@ import torch
|
|
|
4
4
|
from torchaudio.models import RNNT
|
|
5
5
|
from torchaudio.prototype.models.rnnt import TrieNode
|
|
6
6
|
|
|
7
|
+
from torchaudio._internal.module_utils import dropping_class_support
|
|
8
|
+
|
|
7
9
|
__all__ = ["Hypothesis", "RNNTBeamSearchBiasing"]
|
|
8
10
|
|
|
9
11
|
|
|
@@ -80,6 +82,7 @@ def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None:
|
|
|
80
82
|
break
|
|
81
83
|
|
|
82
84
|
|
|
85
|
+
@dropping_class_support
|
|
83
86
|
class RNNTBeamSearchBiasing(torch.nn.Module):
|
|
84
87
|
r"""Beam search decoder for RNN-T model with biasing support.
|
|
85
88
|
|
|
torchaudio/prototype/pipelines/__init__.py
CHANGED

@@ -1,6 +1,15 @@
 from ._vggish import VGGISH, VGGishBundle
-from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle
-from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
+from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH as _HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle
+from .rnnt_pipeline import (
+    EMFORMER_RNNT_BASE_MUSTC as _EMFORMER_RNNT_BASE_MUSTC,
+    EMFORMER_RNNT_BASE_TEDLIUM3 as _EMFORMER_RNNT_BASE_TEDLIUM3
+)
+from torchaudio._internal.module_utils import dropping_const_support
+
+EMFORMER_RNNT_BASE_MUSTC = dropping_const_support(_EMFORMER_RNNT_BASE_MUSTC)
+EMFORMER_RNNT_BASE_TEDLIUM3 = dropping_const_support(_EMFORMER_RNNT_BASE_TEDLIUM3)
+HIFIGAN_VOCODER_V3_LJSPEECH = dropping_const_support(_HIFIGAN_VOCODER_V3_LJSPEECH)
+

 __all__ = [
     "EMFORMER_RNNT_BASE_MUSTC",
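Unlike the function and class wrappers, `dropping_const_support` has to warn on use of a module-level constant (a pipeline bundle object), which cannot be decorated like a callable; presumably it returns a thin proxy that forwards attribute access and warns on each use. A hypothetical sketch consistent with the call sites above — the real implementation in `module_utils.py` is not part of this diff:

import warnings


class _DeprecatedBundleProxy:
    """Forward attribute access to the wrapped bundle, warning on each use."""

    def __init__(self, bundle, name=None):
        object.__setattr__(self, "_bundle", bundle)
        object.__setattr__(self, "_name", name or type(bundle).__name__)

    def __getattr__(self, attr):
        # _bundle/_name live in the instance dict, so this hook only fires
        # for attributes of the wrapped bundle itself.
        warnings.warn(
            f"{self._name} is deprecated and will be removed in a future release.",
            UserWarning,
            stacklevel=2,
        )
        return getattr(self._bundle, attr)


def dropping_const_support(bundle, name=None):
    return _DeprecatedBundleProxy(bundle, name)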
torchaudio/prototype/pipelines/_vggish/__init__.py
CHANGED

@@ -1,3 +1,7 @@
-from ._vggish_pipeline import VGGISH, VGGishBundle
+from ._vggish_pipeline import VGGISH as _VGGISH, VGGishBundle
+from torchaudio._internal.module_utils import dropping_const_support
+
+
+VGGISH = dropping_const_support(_VGGISH, "VGGISH")

 __all__ = ["VGGISH", "VGGishBundle"]
|
@@ -18,6 +18,8 @@ import math
|
|
|
18
18
|
|
|
19
19
|
import torch
|
|
20
20
|
|
|
21
|
+
from torchaudio._internal.module_utils import dropping_class_support
|
|
22
|
+
|
|
21
23
|
|
|
22
24
|
_MEL_BREAK_FREQUENCY_HERTZ = 700.0
|
|
23
25
|
_MEL_HIGH_FREQUENCY_Q = 1127.0
|
|
@@ -191,6 +193,7 @@ def _waveform_to_examples(data):
|
|
|
191
193
|
return log_mel_examples.unsqueeze(1)
|
|
192
194
|
|
|
193
195
|
|
|
196
|
+
@dropping_class_support
|
|
194
197
|
class VGGish(torch.nn.Module):
|
|
195
198
|
"""Implementation of VGGish model :cite:`45611`."""
|
|
196
199
|
|
|
@@ -215,7 +218,7 @@ class VGGish(torch.nn.Module):
|
|
|
215
218
|
|
|
216
219
|
return self.embedding_network(x)
|
|
217
220
|
|
|
218
|
-
|
|
221
|
+
@dropping_class_support
|
|
219
222
|
class VGGishInputProcessor:
|
|
220
223
|
"""Converts raw waveforms to batches of examples to use as inputs to VGGish."""
|
|
221
224
|
|
|
torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
CHANGED

@@ -1,8 +1,8 @@
 from dataclasses import dataclass
 from typing import Callable, Dict

-import
-
+from torchaudio._internal.module_utils import dropping_class_support
+

 from ._vggish_impl import _SAMPLE_RATE, VGGish as _VGGish, VGGishInputProcessor as _VGGishInputProcessor

@@ -12,6 +12,7 @@ def _get_state_dict():
     return torch.load(path)


+@dropping_class_support
 @dataclass
 class VGGishBundle:
     """VGGish :cite:`45611` inference pipeline ported from
torchaudio/prototype/pipelines/hifigan_pipeline.py
CHANGED

@@ -9,7 +9,10 @@ from torchaudio._internal import load_state_dict_from_url
 from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
 from torchaudio.transforms import MelSpectrogram
+from torchaudio._internal.module_utils import dropping_support, dropping_class_support

+
+@dropping_class_support
 @dataclass
 class HiFiGANVocoderBundle:
     """Data class that bundles associated information to use pretrained
@@ -82,6 +85,7 @@ class HiFiGANVocoderBundle:
         state_dict = load_state_dict_from_url(url, **dl_kwargs)
         return state_dict

+    @dropping_support
     def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
         """Construct the HiFiGAN Generator model, which can be used a vocoder, and load the pretrained weight.

@@ -99,6 +103,7 @@ class HiFiGANVocoderBundle:
         model.eval()
         return model

+    @dropping_support
     def get_mel_transform(self) -> Module:
         """Construct an object which transforms waveforms into mel spectrograms."""
         return _HiFiGANMelSpectrogram(
torchaudio/prototype/transforms/_transforms.py
CHANGED

@@ -3,8 +3,9 @@ from typing import Callable, Optional
 import torch
 from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
 from torchaudio.transforms import Spectrogram
+from torchaudio._internal.module_utils import dropping_support, dropping_class_support

-
+@dropping_class_support
 class BarkScale(torch.nn.Module):
     r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.

@@ -72,6 +73,7 @@ class BarkScale(torch.nn.Module):
         return bark_specgram


+@dropping_class_support
 class InverseBarkScale(torch.nn.Module):
     r"""Estimate a STFT in normal frequency domain from bark frequency domain.

@@ -188,6 +190,7 @@ class InverseBarkScale(torch.nn.Module):
         return specgram


+@dropping_class_support
 class BarkSpectrogram(torch.nn.Module):
     r"""Create BarkSpectrogram for a raw audio signal.

@@ -297,6 +300,7 @@ class BarkSpectrogram(torch.nn.Module):
         return bark_specgram


+@dropping_class_support
 class ChromaScale(torch.nn.Module):
     r"""Converts spectrogram to chromagram.

@@ -356,6 +360,7 @@ class ChromaScale(torch.nn.Module):
         return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)


+@dropping_class_support
 class ChromaSpectrogram(torch.nn.Module):
     r"""Generates chromagram for audio signal.

torchaudio/sox_effects/sox_effects.py
CHANGED

@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple

 import torch
 import torchaudio
-from torchaudio._internal.module_utils import deprecated
+from torchaudio._internal.module_utils import deprecated, dropping_support
 from torchaudio.utils.sox_utils import list_effects


@@ -39,6 +39,7 @@ def shutdown_sox_effects():
     pass


+@dropping_support
 def effect_names() -> List[str]:
     """Gets list of valid sox effect names

@@ -52,6 +53,7 @@ def effect_names() -> List[str]:
     return list(list_effects().keys())


+@dropping_support
 def apply_effects_tensor(
     tensor: torch.Tensor,
     sample_rate: int,
@@ -156,6 +158,7 @@ def apply_effects_tensor(
     return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first)


+@dropping_support
 def apply_effects_file(
     path: str,
     effects: List[List[str]],
torchaudio/transforms/__init__.py
CHANGED

@@ -1,3 +1,4 @@
+from torchaudio._internal.module_utils import dropping_class_support
 from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
 from ._transforms import (
     AddNoise,
@@ -21,7 +22,7 @@ from ._transforms import (
     PitchShift,
     Preemphasis,
     Resample,
-    RNNTLoss,
+    RNNTLoss as _RNNTLoss,
     SlidingWindowCmn,
     SpecAugment,
     SpectralCentroid,
@@ -34,6 +35,7 @@ from ._transforms import (
     Vol,
 )

+RNNTLoss = dropping_class_support(_RNNTLoss)

 __all__ = [
     "AddNoise",
torchaudio/transforms/_transforms.py
CHANGED

@@ -10,6 +10,7 @@ from torch.nn.modules.lazy import LazyModuleMixin
 from torch.nn.parameter import UninitializedParameter

 from torchaudio import functional as F
+from torchaudio.functional.functional import _rnnt_loss
 from torchaudio.functional.functional import (
     _apply_sinc_resample_kernel,
     _check_convolve_mode,
@@ -1846,7 +1847,7 @@ class RNNTLoss(torch.nn.Module):
             Tensor: Loss with the reduction option applied. If ``reduction`` is ``"none"``, then size (batch),
             otherwise scalar.
         """
-        return F.rnnt_loss(
+        return _rnnt_loss(
             logits,
             targets,
             logit_lengths,
@@ -2134,4 +2135,4 @@ class Deemphasis(torch.nn.Module):
         Returns:
             torch.Tensor: De-emphasized waveform, with shape `(..., N)`.
         """
-        return F.deemphasis(waveform, coeff=self.coeff)
+        return F.functional.deemphasis(waveform, coeff=self.coeff)
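Both rewrites redirect internal calls to the undecorated implementations: `RNNTLoss.forward` now calls the private `_rnnt_loss`, and `Deemphasis` reaches through `F.functional.deemphasis` (the submodule behind `torchaudio.functional`), so using the transform classes does not also fire the deprecation warnings presumably attached to the public functional wrappers in 2.8. The pattern in miniature (illustrative code, not torchaudio's):

import warnings


def _impl(x):  # private implementation: no warning
    return 2 * x


def public_fn(x):  # public wrapper: deprecated
    warnings.warn("public_fn is deprecated", UserWarning, stacklevel=2)
    return _impl(x)


class Transform:
    def forward(self, x):
        # Call the private implementation directly so that users of the
        # class do not see the deprecation warning meant for public_fn.
        return _impl(x)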