torchaudio 2.8.0-cp312-cp312-win_amd64.whl → 2.9.0-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchaudio has been flagged as potentially problematic.
- torchaudio/__init__.py +179 -39
- torchaudio/_extension/__init__.py +1 -14
- torchaudio/_extension/utils.py +0 -47
- torchaudio/_internal/module_utils.py +12 -3
- torchaudio/_torchcodec.py +73 -85
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/utils.py +1 -1
- torchaudio/functional/__init__.py +0 -2
- torchaudio/functional/_alignment.py +1 -1
- torchaudio/functional/filtering.py +70 -55
- torchaudio/functional/functional.py +26 -60
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +14 -2
- torchaudio/models/decoder/_ctc_decoder.py +6 -6
- torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
- torchaudio/models/squim/objective.py +2 -2
- torchaudio/pipelines/_source_separation_pipeline.py +1 -1
- torchaudio/pipelines/_squim_pipeline.py +2 -2
- torchaudio/pipelines/_tts/utils.py +1 -1
- torchaudio/pipelines/rnnt_pipeline.py +4 -4
- torchaudio/transforms/__init__.py +1 -0
- torchaudio/transforms/_transforms.py +2 -2
- torchaudio/utils/__init__.py +2 -9
- torchaudio/utils/download.py +1 -3
- torchaudio/version.py +2 -2
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
- torchaudio-2.9.0.dist-info/RECORD +85 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
- torchaudio/_backend/__init__.py +0 -61
- torchaudio/_backend/backend.py +0 -53
- torchaudio/_backend/common.py +0 -52
- torchaudio/_backend/ffmpeg.py +0 -334
- torchaudio/_backend/soundfile.py +0 -54
- torchaudio/_backend/soundfile_backend.py +0 -457
- torchaudio/_backend/sox.py +0 -91
- torchaudio/_backend/utils.py +0 -350
- torchaudio/backend/__init__.py +0 -8
- torchaudio/backend/_no_backend.py +0 -25
- torchaudio/backend/_sox_io_backend.py +0 -294
- torchaudio/backend/common.py +0 -13
- torchaudio/backend/no_backend.py +0 -14
- torchaudio/backend/soundfile_backend.py +0 -14
- torchaudio/backend/sox_io_backend.py +0 -14
- torchaudio/io/__init__.py +0 -20
- torchaudio/io/_effector.py +0 -347
- torchaudio/io/_playback.py +0 -72
- torchaudio/kaldi_io.py +0 -150
- torchaudio/prototype/__init__.py +0 -0
- torchaudio/prototype/datasets/__init__.py +0 -4
- torchaudio/prototype/datasets/musan.py +0 -68
- torchaudio/prototype/functional/__init__.py +0 -26
- torchaudio/prototype/functional/_dsp.py +0 -441
- torchaudio/prototype/functional/_rir.py +0 -382
- torchaudio/prototype/functional/functional.py +0 -193
- torchaudio/prototype/models/__init__.py +0 -39
- torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
- torchaudio/prototype/models/_emformer_hubert.py +0 -337
- torchaudio/prototype/models/conv_emformer.py +0 -529
- torchaudio/prototype/models/hifi_gan.py +0 -342
- torchaudio/prototype/models/rnnt.py +0 -717
- torchaudio/prototype/models/rnnt_decoder.py +0 -402
- torchaudio/prototype/pipelines/__init__.py +0 -21
- torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
- torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
- torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
- torchaudio/prototype/transforms/__init__.py +0 -9
- torchaudio/prototype/transforms/_transforms.py +0 -461
- torchaudio/sox_effects/__init__.py +0 -10
- torchaudio/sox_effects/sox_effects.py +0 -275
- torchaudio/utils/ffmpeg_utils.py +0 -11
- torchaudio/utils/sox_utils.py +0 -118
- torchaudio-2.8.0.dist-info/RECORD +0 -145
- torio/__init__.py +0 -8
- torio/_extension/__init__.py +0 -13
- torio/_extension/utils.py +0 -147
- torio/io/__init__.py +0 -9
- torio/io/_streaming_media_decoder.py +0 -977
- torio/io/_streaming_media_encoder.py +0 -502
- torio/lib/__init__.py +0 -0
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/__init__.py +0 -4
- torio/utils/ffmpeg_utils.py +0 -275
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
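Beyond the binary and metadata updates, the listing above shows that 2.9.0 removes whole subsystems: `torchaudio.backend`, `torchaudio._backend`, `torchaudio.io`, `torchaudio.sox_effects`, `torchaudio.kaldi_io`, `torchaudio.prototype`, the SoX/FFmpeg utility modules, and the entire `torio` package. A minimal, hypothetical helper (not part of torchaudio) for scanning a project for imports of the removed top-level modules might look like the sketch below; the module list is derived from the file names above, and the scan is intentionally simple (it will not catch `from torchaudio import sox_effects`-style imports).

```python
# Hypothetical pre-upgrade check: grep a project tree for imports of modules
# that this diff removes. Not part of torchaudio; module names come from the
# file listing above.
import pathlib
import re

REMOVED_IN_2_9 = (
    "torchaudio.backend",
    "torchaudio.io",
    "torchaudio.sox_effects",
    "torchaudio.kaldi_io",
    "torchaudio.prototype",
    "torio",
)
IMPORT_RE = re.compile(
    r"^\s*(?:from|import)\s+(" + "|".join(re.escape(m) for m in REMOVED_IN_2_9) + r")\b"
)


def find_removed_imports(root: str = ".") -> None:
    """Print every line in *.py files under ``root`` that imports a removed module."""
    for path in pathlib.Path(root).rglob("*.py"):
        for lineno, line in enumerate(path.read_text(errors="ignore").splitlines(), 1):
            match = IMPORT_RE.match(line)
            if match:
                print(f"{path}:{lineno}: uses {match.group(1)}, removed in torchaudio 2.9.0")


if __name__ == "__main__":
    find_removed_imports()
```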
--- a/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
+++ /dev/null
@@ -1,83 +0,0 @@
-from dataclasses import dataclass
-from typing import Callable, Dict
-
-from torchaudio._internal.module_utils import dropping_class_support
-
-
-from ._vggish_impl import _SAMPLE_RATE, VGGish as _VGGish, VGGishInputProcessor as _VGGishInputProcessor
-
-
-def _get_state_dict():
-    path = torchaudio.utils.download_asset("models/vggish.pt")
-    return torch.load(path)
-
-
-@dropping_class_support
-@dataclass
-class VGGishBundle:
-    """VGGish :cite:`45611` inference pipeline ported from
-    `torchvggish <https://github.com/harritaylor/torchvggish>`__
-    and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
-
-    Example:
-        >>> import torchaudio
-        >>> from torchaudio.prototype.pipelines import VGGISH
-        >>>
-        >>> input_sr = VGGISH.sample_rate
-        >>> input_proc = VGGISH.get_input_processor()
-        >>> model = VGGISH.get_model()
-        >>>
-        >>> waveform, sr = torchaudio.load(
-        >>>     "Chopin_Ballade_-1_In_G_Minor,_Op._23.mp3",
-        >>> )
-        >>> waveform = waveform.squeeze(0)
-        >>> waveform = torchaudio.functional.resample(waveform, sr, input_sr)
-        >>> mono_output = model(input_proc(waveform))
-    """
-
-    class VGGish(_VGGish):
-        __doc__ = _VGGish.__doc__
-
-    class VGGishInputProcessor(_VGGishInputProcessor):
-        __doc__ = _VGGishInputProcessor.__doc__
-
-    _state_dict_func: Callable[[], Dict]
-
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of input waveform expected by input processor and model.
-
-        :type: int
-        """
-        return _SAMPLE_RATE
-
-    def get_model(self) -> VGGish:
-        """Constructs pre-trained VGGish model. Downloads and caches weights as necessary.
-
-        Returns:
-            VGGish: VGGish model with pre-trained weights loaded.
-        """
-        model = self.VGGish()
-        state_dict = self._state_dict_func()
-        model.load_state_dict(state_dict)
-        model.eval()
-        return model
-
-    def get_input_processor(self) -> VGGishInputProcessor:
-        """Constructs input processor for VGGish.
-
-        Returns:
-            VGGishInputProcessor: input processor for VGGish.
-        """
-        return self.VGGishInputProcessor()
-
-
-VGGISH = VGGishBundle(_get_state_dict)
-VGGISH.__doc__ = """Pre-trained VGGish :cite:`45611` inference pipeline ported from
-`torchvggish <https://github.com/harritaylor/torchvggish>`__
-and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
-
-Per the `documentation <https://github.com/tensorflow/models/tree/master/research/audioset/vggish>`__
-for the original model, the model is "trained on a large YouTube dataset (a preliminary version of
-what later became YouTube-8M)".
-"""
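The deleted bundle is a thin dataclass around a state-dict factory, so a project that depended on it can approximate the same behavior locally. The following is a hedged sketch, assuming `_vggish_impl.py` is vendored (copied) from the 2.8.0 wheel and that `torchaudio.utils.download_asset`, `torchaudio.load`, and `torchaudio.functional.resample` remain available in 2.9.0; the audio file name is a placeholder.

```python
# Hypothetical local replacement for the removed VGGISH bundle (not part of torchaudio 2.9.0).
# Assumes _vggish_impl.py has been vendored from the torchaudio 2.8.0 wheel.
import torch
import torchaudio

from _vggish_impl import _SAMPLE_RATE, VGGish, VGGishInputProcessor  # vendored from 2.8.0


def load_vggish() -> torch.nn.Module:
    """Build VGGish with the same pretrained weights the deleted bundle downloaded."""
    model = VGGish()
    path = torchaudio.utils.download_asset("models/vggish.pt")  # assumed still available
    model.load_state_dict(torch.load(path))
    model.eval()
    return model


if __name__ == "__main__":
    # Mirrors the docstring example above: mono waveform resampled to _SAMPLE_RATE,
    # run through the input processor, then the model.
    model = load_vggish()
    input_proc = VGGishInputProcessor()
    waveform, sr = torchaudio.load("example.wav")  # placeholder path
    waveform = torchaudio.functional.resample(waveform.squeeze(0), sr, _SAMPLE_RATE)
    embeddings = model(input_proc(waveform))
```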
--- a/torchaudio/prototype/pipelines/hifigan_pipeline.py
+++ /dev/null
@@ -1,233 +0,0 @@
-from dataclasses import dataclass
-from typing import Any, Dict, Optional
-
-import torch
-import torch.nn.functional as F
-from torch.nn import Module
-from torchaudio._internal import load_state_dict_from_url
-
-from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
-from torchaudio.transforms import MelSpectrogram
-
-from torchaudio._internal.module_utils import dropping_support, dropping_class_support
-
-
-@dropping_class_support
-@dataclass
-class HiFiGANVocoderBundle:
-    """Data class that bundles associated information to use pretrained
-    :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
-
-    This class provides interfaces for instantiating the pretrained model along with
-    the information necessary to retrieve pretrained weights and additional data
-    to be used with the model.
-
-    Torchaudio library instantiates objects of this class, each of which represents
-    a different pretrained model. Client code should access pretrained models via these
-    instances.
-
-    This bundle can convert mel spectrorgam to waveforms and vice versa. A typical use case would be a flow like
-    `text -> mel spectrogram -> waveform`, where one can use an external component, e.g. Tacotron2,
-    to generate mel spectrogram from text. Please see below for the code example.
-
-    Example: Transform synthetic mel spectrogram to audio.
-        >>> import torch
-        >>> import torchaudio
-        >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
-        >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
-        >>>
-        >>> # Load the HiFiGAN bundle
-        >>> vocoder = bundle.get_vocoder()
-        Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
-        100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
-        >>>
-        >>> # Generate synthetic mel spectrogram
-        >>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
-        >>>
-        >>> # Transform mel spectrogram into audio
-        >>> waveform = vocoder(specgram)
-        >>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)
-
-    Example: Usage together with Tacotron2, text to audio.
-        >>> import torch
-        >>> import torchaudio
-        >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
-        >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
-        >>>
-        >>> # Load Tacotron2 bundle
-        >>> bundle_tactron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
-        >>> processor = bundle_tactron2.get_text_processor()
-        >>> tacotron2 = bundle_tactron2.get_tacotron2()
-        >>>
-        >>> # Use Tacotron2 to convert text to mel spectrogram
-        >>> text = "A quick brown fox jumped over a lazy dog"
-        >>> input, lengths = processor(text)
-        >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
-        >>>
-        >>> # Load HiFiGAN bundle
-        >>> vocoder = bundle_hifigan.get_vocoder()
-        Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
-        100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
-        >>>
-        >>> # Use HiFiGAN to convert mel spectrogram to audio
-        >>> waveform = vocoder(specgram).squeeze(0)
-        >>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
-    """ # noqa: E501
-
-    _path: str
-    _vocoder_params: Dict[str, Any]  # Vocoder parameters
-    _mel_params: Dict[str, Any]  # Mel transformation parameters
-    _sample_rate: float
-
-    def _get_state_dict(self, dl_kwargs):
-        url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
-        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
-        state_dict = load_state_dict_from_url(url, **dl_kwargs)
-        return state_dict
-
-    @dropping_support
-    def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
-        """Construct the HiFiGAN Generator model, which can be used a vocoder, and load the pretrained weight.
-
-        The weight file is downloaded from the internet and cached with
-        :func:`torch.hub.load_state_dict_from_url`
-
-        Args:
-            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
-
-        Returns:
-            Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
-        """
-        model = hifigan_vocoder(**self._vocoder_params)
-        model.load_state_dict(self._get_state_dict(dl_kwargs))
-        model.eval()
-        return model
-
-    @dropping_support
-    def get_mel_transform(self) -> Module:
-        """Construct an object which transforms waveforms into mel spectrograms."""
-        return _HiFiGANMelSpectrogram(
-            n_mels=self._vocoder_params["in_channels"],
-            sample_rate=self._sample_rate,
-            **self._mel_params,
-        )
-
-    @property
-    def sample_rate(self):
-        """Sample rate of the audio that the model is trained on.
-
-        :type: float
-        """
-        return self._sample_rate
-
-
-class _HiFiGANMelSpectrogram(torch.nn.Module):
-    """
-    Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
-    https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72
-
-    This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achive
-    equivalence with the HiFiGAN implementation.
-
-    Args:
-        hop_size (int): Length of hop between STFT windows.
-        n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
-        win_length (int): Window size.
-        f_min (float or None): Minimum frequency.
-        f_max (float or None): Maximum frequency.
-        sample_rate (int): Sample rate of audio signal.
-        n_mels (int): Number of mel filterbanks.
-    """
-
-    def __init__(
-        self,
-        hop_size: int,
-        n_fft: int,
-        win_length: int,
-        f_min: Optional[float],
-        f_max: Optional[float],
-        sample_rate: float,
-        n_mels: int,
-    ):
-        super(_HiFiGANMelSpectrogram, self).__init__()
-        self.mel_transform = MelSpectrogram(
-            sample_rate=sample_rate,
-            n_fft=n_fft,
-            win_length=win_length,
-            hop_length=hop_size,
-            f_min=f_min,
-            f_max=f_max,
-            n_mels=n_mels,
-            normalized=False,
-            pad=0,
-            mel_scale="slaney",
-            norm="slaney",
-            center=False,
-        )
-        self.sample_rate = sample_rate
-        self.hop_size = hop_size
-        self.n_fft = n_fft
-        self.win_length = win_length
-        self.f_min = f_min
-        self.f_max = f_max
-        self.n_mels = n_mels
-        self.pad_size = int((n_fft - hop_size) / 2)
-
-    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-        """Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.
-
-        Args:
-            waveform (Tensor): waveform of shape ``(batch_size, time_length)``.
-        Returns:
-            Tensor of shape ``(batch_size, n_mel, time_length)``
-        """
-        ref_waveform = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect")
-        ref_waveform = ref_waveform.squeeze(1)
-
-        spectr = (self.mel_transform.spectrogram(ref_waveform) + 1e-9) ** 0.5
-        mel_spectrogram = self.mel_transform.mel_scale(spectr)
-        mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-5))
-        return mel_spectrogram
-
-
-HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
-    "hifigan_vocoder_v3_ljspeech.pth",
-    _vocoder_params={
-        "upsample_rates": (8, 8, 4),
-        "upsample_kernel_sizes": (16, 16, 8),
-        "upsample_initial_channel": 256,
-        "resblock_kernel_sizes": (3, 5, 7),
-        "resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
-        "resblock_type": 2,
-        "in_channels": 80,
-        "lrelu_slope": 0.1,
-    },
-    _mel_params={
-        "hop_size": 256,
-        "n_fft": 1024,
-        "win_length": 1024,
-        "f_min": 0,
-        "f_max": 8000,
-    },
-    _sample_rate=22050,
-)
-HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
-:cite:`ljspeech17`.
-
-This pipeine can be used with an external component which generates mel spectrograms from text, for example,
-Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
-Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
-using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
-HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
-:py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
-:py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
-<https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72>`_.
-
-The underlying vocoder is constructed by
-:py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
-with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
-<https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/LICENSE>`__. See links to
-pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.
-
-Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
-"""
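For code that only needed the HiFiGAN-compatible mel transform, the deleted `_HiFiGANMelSpectrogram` can be reproduced outside the prototype namespace using public torchaudio APIs. The following is a sketch based directly on the removed implementation above, with defaults taken from the V3/LJSpeech `_mel_params`; it is not a torchaudio class.

```python
# Sketch of the HiFiGAN-style mel transform from the deleted file, kept as a standalone
# module so it can outlive the prototype pipeline. Defaults mirror _mel_params above.
import torch
import torch.nn.functional as F
from torchaudio.transforms import MelSpectrogram


class HiFiGANMelSpectrogram(torch.nn.Module):
    def __init__(self, hop_size=256, n_fft=1024, win_length=1024, f_min=0.0, f_max=8000.0,
                 sample_rate=22050, n_mels=80):
        super().__init__()
        self.mel_transform = MelSpectrogram(
            sample_rate=sample_rate, n_fft=n_fft, win_length=win_length, hop_length=hop_size,
            f_min=f_min, f_max=f_max, n_mels=n_mels, normalized=False, pad=0,
            mel_scale="slaney", norm="slaney", center=False,
        )
        self.pad_size = (n_fft - hop_size) // 2

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        # Reflection-pad, take the square root of the power spectrogram, apply the mel
        # filter bank, then log-compress with the same 1e-5 clamp as the original code.
        padded = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect").squeeze(1)
        spec = (self.mel_transform.spectrogram(padded) + 1e-9) ** 0.5
        return torch.log(torch.clamp(self.mel_transform.mel_scale(spec), min=1e-5))
```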
--- a/torchaudio/prototype/pipelines/rnnt_pipeline.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from functools import partial
-
-from torchaudio.models import emformer_rnnt_base
-from torchaudio.pipelines import RNNTBundle
-
-
-EMFORMER_RNNT_BASE_MUSTC = RNNTBundle(
-    _rnnt_path="models/emformer_rnnt_base_mustc.pt",
-    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
-    _global_stats_path="pipeline-assets/global_stats_rnnt_mustc.json",
-    _sp_model_path="pipeline-assets/spm_bpe_500_mustc.model",
-    _right_padding=4,
-    _blank=500,
-    _sample_rate=16000,
-    _n_fft=400,
-    _n_mels=80,
-    _hop_length=160,
-    _segment_length=16,
-    _right_context_length=4,
-)
-EMFORMER_RNNT_BASE_MUSTC.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
-streaming and non-streaming inference.
-
-The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
-and utilizes weights trained on *MuST-C release v2.0* :cite:`CATTONI2021101155` dataset
-using training script ``train.py``
-`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
-with ``num_symbols=501``.
-
-Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
-"""
-
-
-EMFORMER_RNNT_BASE_TEDLIUM3 = RNNTBundle(
-    _rnnt_path="models/emformer_rnnt_base_tedlium3.pt",
-    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
-    _global_stats_path="pipeline-assets/global_stats_rnnt_tedlium3.json",
-    _sp_model_path="pipeline-assets/spm_bpe_500_tedlium3.model",
-    _right_padding=4,
-    _blank=500,
-    _sample_rate=16000,
-    _n_fft=400,
-    _n_mels=80,
-    _hop_length=160,
-    _segment_length=16,
-    _right_context_length=4,
-)
-EMFORMER_RNNT_BASE_TEDLIUM3.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
-streaming and non-streaming inference.
-
-The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
-and utilizes weights trained on *TED-LIUM Release 3* :cite:`rousseau2012tedlium` dataset
-using training script ``train.py``
-`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
-with ``num_symbols=501``.
-
-Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
-"""
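Only the prototype MuST-C and TED-LIUM 3 Emformer-RNNT bundles are removed here; the non-prototype `torchaudio/pipelines/rnnt_pipeline.py` is only lightly touched in this diff (+4 -4). The sketch below follows the documented `RNNTBundle` usage pattern with the LibriSpeech bundle and assumes it remains functional in 2.9.0; the audio path is a placeholder, and exact input shapes should be checked against the torchaudio documentation.

```python
# Sketch only: standard RNNTBundle flow (feature extraction -> beam search -> token processing),
# assuming torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH is still usable in 2.9.0.
import torchaudio

bundle = torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH
feature_extractor = bundle.get_feature_extractor()
decoder = bundle.get_decoder()
token_processor = bundle.get_token_processor()

waveform, sample_rate = torchaudio.load("speech.wav")  # placeholder path; 16 kHz mono expected
features, length = feature_extractor(waveform.squeeze(0))
hypotheses = decoder(features, length, 10)  # beam width of 10
print(token_processor(hypotheses[0][0]))  # best hypothesis tokens -> text
```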