torchaudio-2.7.1-cp312-cp312-win_amd64.whl → torchaudio-2.8.0-cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic.

Files changed (52)
  1. torchaudio/__init__.py +16 -5
  2. torchaudio/_backend/sox.py +2 -2
  3. torchaudio/_backend/utils.py +33 -0
  4. torchaudio/_internal/module_utils.py +59 -10
  5. torchaudio/_torchcodec.py +352 -0
  6. torchaudio/backend/no_backend.py +2 -2
  7. torchaudio/backend/soundfile_backend.py +2 -2
  8. torchaudio/backend/sox_io_backend.py +2 -2
  9. torchaudio/functional/__init__.py +6 -1
  10. torchaudio/functional/functional.py +7 -3
  11. torchaudio/io/__init__.py +10 -3
  12. torchaudio/kaldi_io.py +6 -0
  13. torchaudio/lib/_torchaudio.pyd +0 -0
  14. torchaudio/lib/libtorchaudio.pyd +0 -0
  15. torchaudio/models/decoder/__init__.py +7 -1
  16. torchaudio/pipelines/_tts/utils.py +3 -1
  17. torchaudio/prototype/datasets/musan.py +2 -1
  18. torchaudio/prototype/functional/_dsp.py +8 -0
  19. torchaudio/prototype/functional/_rir.py +3 -0
  20. torchaudio/prototype/functional/functional.py +3 -0
  21. torchaudio/prototype/models/__init__.py +4 -1
  22. torchaudio/prototype/models/_conformer_wav2vec2.py +7 -0
  23. torchaudio/prototype/models/_emformer_hubert.py +4 -0
  24. torchaudio/prototype/models/conv_emformer.py +4 -0
  25. torchaudio/prototype/models/hifi_gan.py +6 -0
  26. torchaudio/prototype/models/rnnt.py +6 -0
  27. torchaudio/prototype/models/rnnt_decoder.py +3 -0
  28. torchaudio/prototype/pipelines/__init__.py +11 -2
  29. torchaudio/prototype/pipelines/_vggish/__init__.py +5 -1
  30. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +4 -1
  31. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +3 -2
  32. torchaudio/prototype/pipelines/hifigan_pipeline.py +5 -0
  33. torchaudio/prototype/transforms/_transforms.py +6 -1
  34. torchaudio/sox_effects/sox_effects.py +4 -1
  35. torchaudio/transforms/__init__.py +3 -1
  36. torchaudio/transforms/_transforms.py +3 -2
  37. torchaudio/utils/download.py +2 -0
  38. torchaudio/utils/sox_utils.py +19 -0
  39. torchaudio/version.py +2 -2
  40. {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/METADATA +13 -2
  41. {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/RECORD +52 -51
  42. torio/io/_streaming_media_decoder.py +0 -1
  43. torio/lib/_torio_ffmpeg4.pyd +0 -0
  44. torio/lib/_torio_ffmpeg5.pyd +0 -0
  45. torio/lib/_torio_ffmpeg6.pyd +0 -0
  46. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  47. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  48. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  49. torio/utils/ffmpeg_utils.py +28 -0
  50. {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/WHEEL +0 -0
  51. {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/licenses/LICENSE +0 -0
  52. {torchaudio-2.7.1.dist-info → torchaudio-2.8.0.dist-info}/top_level.txt +0 -0
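
The bulk of this release's Python-side changes follow a single pattern: public APIs slated for removal are wrapped with deprecation helpers (dropping_support, dropping_class_support, dropping_io_support, dropping_const_support) added to torchaudio/_internal/module_utils.py (+59 -10). The body of module_utils.py is not shown in this diff, so the following is only a minimal sketch of what a function-level wrapper of this kind plausibly looks like; the real implementation may differ in warning category and message text.

    import functools
    import warnings

    def dropping_support(func):
        """Illustrative stand-in: warn on every call that the API will be removed."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"{func.__module__}.{func.__qualname__} is deprecated and will be "
                "removed in a future release of torchaudio.",
                UserWarning,
                stacklevel=2,
            )
            return func(*args, **kwargs)
        return wrapper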
torchaudio/io/__init__.py CHANGED
@@ -1,7 +1,14 @@
- from torio.io import CodecConfig, StreamingMediaDecoder as StreamReader, StreamingMediaEncoder as StreamWriter
+ from torio.io import CodecConfig as _CodecConfig, StreamingMediaDecoder as _StreamReader, StreamingMediaEncoder as _StreamWriter
+ from torchaudio._internal.module_utils import dropping_class_io_support, dropping_class_support, dropping_io_support

- from ._effector import AudioEffector
- from ._playback import play_audio
+ from ._effector import AudioEffector as _AudioEffector
+ from ._playback import play_audio as _play_audio
+
+ CodecConfig = dropping_class_io_support(_CodecConfig)
+ StreamReader = dropping_class_io_support(_StreamReader)
+ StreamWriter = dropping_class_io_support(_StreamWriter)
+ AudioEffector = dropping_class_support(_AudioEffector)
+ play_audio = dropping_io_support(_play_audio)


  __all__ = [
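
Note the rebinding pattern: the originals are imported under private aliases and the public names are reassigned to wrapped objects, so `from torchaudio.io import StreamReader` keeps working while use of the object can now emit a deprecation warning. A hedged usage sketch (assuming, as in the sketch above, that the wrappers warn on use rather than on import; "input.wav" is a placeholder file):

    import warnings
    from torchaudio.io import StreamReader  # the import itself is expected to stay silent

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        reader = StreamReader("input.wav")  # warning expected here, at the point of use
    print([str(w.message) for w in caught])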
torchaudio/kaldi_io.py CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Callable, Iterable, Tuple
  import torch
  from torch import Tensor
  from torchaudio._internal import module_utils as _mod_utils
+ from torchaudio._internal.module_utils import dropping_support

  if _mod_utils.is_module_available("numpy"):
      import numpy as np
@@ -41,6 +42,7 @@ def _convert_method_output_to_tensor(
          yield key, torch.from_numpy(np_arr)


+ @dropping_support
  @_mod_utils.requires_module("kaldi_io", "numpy")
  def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      r"""Create generator of (key,vector<int>) tuples, which reads from the ark file/stream.
@@ -64,6 +66,7 @@ def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_int_ark, convert_contiguous=True)


+ @dropping_support
  @_mod_utils.requires_module("kaldi_io", "numpy")
  def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      r"""Create generator of (key,vector<float32/float64>) tuples, read according to Kaldi scp.
@@ -84,6 +87,7 @@ def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_flt_scp)


+ @dropping_support
  @_mod_utils.requires_module("kaldi_io", "numpy")
  def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      r"""Create generator of (key,vector<float32/float64>) tuples, which reads from the ark file/stream.
@@ -104,6 +108,7 @@ def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_flt_ark)


+ @dropping_support
  @_mod_utils.requires_module("kaldi_io", "numpy")
  def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      r"""Create generator of (key,matrix<float32/float64>) tuples, read according to Kaldi scp.
@@ -124,6 +129,7 @@ def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_mat_scp)


+ @dropping_support
  @_mod_utils.requires_module("kaldi_io", "numpy")
  def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]:
      r"""Create generator of (key,matrix<float32/float64>) tuples, which reads from the ark file/stream.
torchaudio/lib/_torchaudio.pyd CHANGED
Binary file
torchaudio/lib/libtorchaudio.pyd CHANGED
Binary file
torchaudio/models/decoder/__init__.py CHANGED
@@ -1,3 +1,5 @@
+ from torchaudio._internal.module_utils import dropping_support, dropping_class_support
+ import inspect
  _CTC_DECODERS = [
      "CTCHypothesis",
      "CTCDecoder",
@@ -33,7 +35,11 @@ def __getattr__(name: str):
                  "To use CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
              ) from err

-         item = getattr(_cuda_ctc_decoder, name)
+         orig_item = getattr(_cuda_ctc_decoder, name)
+         if inspect.isclass(orig_item):
+             item = dropping_class_support(orig_item)
+         else:
+             item = dropping_support(orig_item)
          globals()[name] = item
          return item
      raise AttributeError(f"module {__name__} has no attribute {name}")
torchaudio/pipelines/_tts/utils.py CHANGED
@@ -161,6 +161,7 @@ def _load_phonemizer(file, dl_kwargs):
          raise RuntimeError("DeepPhonemizer is not installed. Please install it.")

      from dp.phonemizer import Phonemizer
+     from dp.preprocessing.text import Preprocessor, LanguageTokenizer, SequenceTokenizer

      # By default, dp issues DEBUG level log.
      logger = logging.getLogger("dp")
@@ -174,7 +175,8 @@ def _load_phonemizer(file, dl_kwargs):
          if not os.path.exists(path):
              dl_kwargs = {} if dl_kwargs is None else dl_kwargs
              download_url_to_file(url, path, **dl_kwargs)
-         return Phonemizer.from_checkpoint(path)
+         with torch.serialization.safe_globals([Preprocessor, LanguageTokenizer, SequenceTokenizer]):
+             return Phonemizer.from_checkpoint(path)
      finally:
          logger.setLevel(orig_level)

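The safe_globals change tracks PyTorch's move to weights_only=True as the default for torch.load (PyTorch 2.6): checkpoints that pickle arbitrary classes, like DeepPhonemizer's, now need those classes allowlisted during deserialization. torch.serialization.safe_globals (available since PyTorch 2.5) does that as a context manager. A minimal self-contained example with a stand-in class:

    import torch
    from dataclasses import dataclass

    @dataclass
    class Config:  # stands in for e.g. dp.preprocessing.text.Preprocessor
        lang: str = "en"

    torch.save(Config(), "config.pt")

    with torch.serialization.safe_globals([Config]):
        cfg = torch.load("config.pt", weights_only=True)  # raises without the allowlist
    print(cfg)
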
torchaudio/prototype/datasets/musan.py CHANGED
@@ -4,12 +4,13 @@ from typing import Tuple, Union
  import torch
  from torch.utils.data import Dataset
  from torchaudio.datasets.utils import _load_waveform
+ from torchaudio._internal.module_utils import dropping_support, dropping_class_support


  _SUBSETS = ["music", "noise", "speech"]
  _SAMPLE_RATE = 16_000

-
+ @dropping_class_support
  class Musan(Dataset):
      r"""*MUSAN* :cite:`musan2015` dataset.

torchaudio/prototype/functional/_dsp.py CHANGED
@@ -4,8 +4,10 @@ from typing import List, Optional, Union
  import torch

  from torchaudio.functional import fftconvolve
+ from torchaudio._internal.module_utils import dropping_support


+ @dropping_support
  def oscillator_bank(
      frequencies: torch.Tensor,
      amplitudes: torch.Tensor,
@@ -81,6 +83,7 @@ def oscillator_bank(
      return waveform


+ @dropping_support
  def adsr_envelope(
      num_frames: int,
      *,
@@ -182,6 +185,7 @@ def adsr_envelope(
      return out


+ @dropping_support
  def extend_pitch(
      base: torch.Tensor,
      pattern: Union[int, List[float], torch.Tensor],
@@ -249,6 +253,7 @@ def extend_pitch(
      return h_freq


+ @dropping_support
  def sinc_impulse_response(cutoff: torch.Tensor, window_size: int = 513, high_pass: bool = False):
      """Create windowed-sinc impulse response for given cutoff frequencies.

@@ -288,6 +293,7 @@ def sinc_impulse_response(cutoff: torch.Tensor, window_size: int = 513, high_pas
      return filt


+ @dropping_support
  def frequency_impulse_response(magnitudes):
      """Create filter from desired frequency response

@@ -319,6 +325,7 @@ def _overlap_and_add(waveform, stride):
      return buffer


+ @dropping_support
  def filter_waveform(waveform: torch.Tensor, kernels: torch.Tensor, delay_compensation: int = -1):
      """Applies filters along time axis of the given waveform.

@@ -404,6 +411,7 @@ def filter_waveform(waveform: torch.Tensor, kernels: torch.Tensor, delay_compens
      return result


+ @dropping_support
  def exp_sigmoid(
      input: torch.Tensor, exponent: float = 10.0, max_value: float = 2.0, threshold: float = 1e-7
  ) -> torch.Tensor:
torchaudio/prototype/functional/_rir.py CHANGED
@@ -1,5 +1,6 @@
  import math
  from typing import Optional, Tuple, Union
+ from torchaudio._internal.module_utils import dropping_support

  import torch
  import torchaudio
@@ -176,6 +177,7 @@ def _validate_inputs(
          raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")


+ @dropping_support
  def simulate_rir_ism(
      room: torch.Tensor,
      source: torch.Tensor,
@@ -276,6 +278,7 @@ def simulate_rir_ism(
      return rir


+ @dropping_support
  def ray_tracing(
      room: torch.Tensor,
      source: torch.Tensor,
torchaudio/prototype/functional/functional.py CHANGED
@@ -4,6 +4,7 @@ from typing import Optional

  import torch
  from torchaudio.functional.functional import _create_triangular_filterbank
+ from torchaudio._internal.module_utils import dropping_support


  def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
@@ -72,6 +73,7 @@ def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
      return torch.log2(freqs / (a440 / 16))


+ @dropping_support
  def barkscale_fbanks(
      n_freqs: int,
      f_min: float,
@@ -129,6 +131,7 @@ def barkscale_fbanks(
      return fb


+ @dropping_support
  def chroma_filterbank(
      sample_rate: int,
      n_freqs: int,
torchaudio/prototype/models/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from torchaudio._internal.module_utils import dropping_const_support
  from ._conformer_wav2vec2 import (
      conformer_wav2vec2_base,
      conformer_wav2vec2_model,
@@ -10,7 +11,9 @@ from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
  from .conv_emformer import ConvEmformer
  from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
  from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
- from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
+ from .rnnt_decoder import Hypothesis as _Hypothesis, RNNTBeamSearchBiasing
+
+ Hypothesis = dropping_const_support(_Hypothesis, name="Hypothesis")

  __all__ = [
      "conformer_rnnt_base",
torchaudio/prototype/models/_conformer_wav2vec2.py CHANGED
@@ -7,6 +7,7 @@ from torchaudio.models import Wav2Vec2Model
  from torchaudio.models.conformer import ConformerLayer
  from torchaudio.models.rnnt import _TimeReduction
  from torchaudio.models.wav2vec2 import components
+ from torchaudio._internal.module_utils import dropping_class_support, dropping_support


  def _buffered_arange(max) -> Tensor:
@@ -252,6 +253,7 @@ class ConformerEncoder(Module):
          return self._get_intermediate_outputs(x, mask=masks, num_layers=num_layers)


+ @dropping_class_support
  class ConformerWav2Vec2PretrainModel(Module):
      """Conformer Wav2Vec2 pre-train model for training from scratch.

@@ -437,6 +439,7 @@ def _get_conformer_negativer_sampler(
      return NegativeSampler(preprocessor, num_negatives, cross_sample_negatives)


+ @dropping_support
  def conformer_wav2vec2_model(
      extractor_input_dim: int,
      extractor_output_dim: int,
@@ -501,6 +504,7 @@ def conformer_wav2vec2_model(
      return Wav2Vec2Model(feature_extractor, encoder)


+ @dropping_support
  def conformer_wav2vec2_base(
      extractor_input_dim: int = 64,
      extractor_output_dim: int = 256,
@@ -536,6 +540,7 @@ def conformer_wav2vec2_base(
      )


+ @dropping_support
  def conformer_wav2vec2_pretrain_model(
      extractor_input_dim: int,
      extractor_output_dim: int,
@@ -672,6 +677,7 @@ def conformer_wav2vec2_pretrain_model(
      )


+ @dropping_support
  def conformer_wav2vec2_pretrain_base(
      extractor_input_dim: int = 64,
      extractor_output_dim: int = 256,
@@ -733,6 +739,7 @@ def conformer_wav2vec2_pretrain_base(
      )


+ @dropping_support
  def conformer_wav2vec2_pretrain_large(
      extractor_input_dim: int = 64,
      extractor_output_dim: int = 256,
torchaudio/prototype/models/_emformer_hubert.py CHANGED
@@ -4,6 +4,8 @@ import torch
  from torchaudio.models import Wav2Vec2Model
  from torchaudio.models.emformer import Emformer
  from torchaudio.models.rnnt import _TimeReduction
+ from torchaudio._internal.module_utils import dropping_support
+


  class FeatureEncoder(torch.nn.Module):
@@ -217,6 +219,7 @@ def _get_emformer_encoder(
      return EmformerEncoder(emformer, output_linear, layer_norm)


+ @dropping_support
  def emformer_hubert_model(
      extractor_input_dim: int,
      extractor_output_dim: int,
@@ -292,6 +295,7 @@ def emformer_hubert_model(
      return Wav2Vec2Model(feature_extractor, emformer, aux)


+ @dropping_support
  def emformer_hubert_base(
      extractor_input_dim: int = 80,
      extractor_output_dim: int = 128,
torchaudio/prototype/models/conv_emformer.py CHANGED
@@ -3,6 +3,8 @@ from typing import List, Optional, Tuple

  import torch
  from torchaudio.models.emformer import _EmformerAttention, _EmformerImpl, _get_weight_init_gains
+ from torchaudio._internal.module_utils import dropping_class_support, dropping_support
+


  def _get_activation_module(activation: str) -> torch.nn.Module:
@@ -441,6 +443,7 @@ class _ConvEmformerLayer(torch.nn.Module):
          return output_utterance, output_right_context, output_state, next_m


+ @dropping_class_support
  class ConvEmformer(_EmformerImpl):
      r"""Implements the convolution-augmented streaming transformer architecture introduced in
      *Streaming Transformer Transducer based Speech Recognition Using Non-Causal Convolution*
@@ -476,6 +479,7 @@ class ConvEmformer(_EmformerImpl):
          >>> output, lengths, states = conv_emformer.infer(input, lengths, None)
      """

+     @dropping_support
      def __init__(
          self,
          input_dim: int,
torchaudio/prototype/models/hifi_gan.py CHANGED
@@ -28,8 +28,10 @@ import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from torch.nn import Conv1d, ConvTranspose1d
+ from torchaudio._internal.module_utils import dropping_class_support, dropping_support


+ @dropping_class_support
  class HiFiGANVocoder(torch.nn.Module):
      """Generator part of *HiFi GAN* :cite:`NEURIPS2020_c5d73680`.
      Source: https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/models.py#L75
@@ -246,6 +248,7 @@ def get_padding(kernel_size, dilation=1):
      return int((kernel_size * dilation - dilation) / 2)


+ @dropping_support
  def hifigan_vocoder(
      in_channels: int,
      upsample_rates: Tuple[int, ...],
@@ -282,6 +285,7 @@ def hifigan_vocoder(
      )


+ @dropping_support
  def hifigan_vocoder_v1() -> HiFiGANVocoder:
      r"""Builds HiFiGAN Vocoder with V1 architecture :cite:`NEURIPS2020_c5d73680`.

@@ -300,6 +304,7 @@ def hifigan_vocoder_v1() -> HiFiGANVocoder:
      )


+ @dropping_support
  def hifigan_vocoder_v2() -> HiFiGANVocoder:
      r"""Builds HiFiGAN Vocoder with V2 architecture :cite:`NEURIPS2020_c5d73680`.

@@ -318,6 +323,7 @@ def hifigan_vocoder_v2() -> HiFiGANVocoder:
      )


+ @dropping_support
  def hifigan_vocoder_v3() -> HiFiGANVocoder:
      r"""Builds HiFiGAN Vocoder with V3 architecture :cite:`NEURIPS2020_c5d73680`.

torchaudio/prototype/models/rnnt.py CHANGED
@@ -5,6 +5,8 @@ import torch
  from torchaudio.models import Conformer, RNNT
  from torchaudio.models.rnnt import _Joiner, _Predictor, _TimeReduction, _Transcriber

+ from torchaudio._internal.module_utils import dropping_support
+

  TrieNode = Tuple[Dict[int, "TrieNode"], int, Optional[Tuple[int, int]]]

@@ -472,6 +474,7 @@ class RNNTBiasing(RNNT):
          return output, source_lengths, jointer_activation


+ @dropping_support
  def conformer_rnnt_model(
      *,
      input_dim: int,
@@ -544,6 +547,7 @@ def conformer_rnnt_model(
      return RNNT(encoder, predictor, joiner)


+ @dropping_support
  def conformer_rnnt_base() -> RNNT:
      r"""Builds basic version of Conformer RNN-T model.

@@ -572,6 +576,7 @@ def conformer_rnnt_base() -> RNNT:
      )


+ @dropping_support
  def conformer_rnnt_biasing(
      *,
      input_dim: int,
@@ -677,6 +682,7 @@ def conformer_rnnt_biasing(
      )


+ @dropping_support
  def conformer_rnnt_biasing_base(charlist=None, biasing=True) -> RNNT:
      r"""Builds basic version of Conformer RNN-T model with TCPGen.

torchaudio/prototype/models/rnnt_decoder.py CHANGED
@@ -4,6 +4,8 @@ import torch
  from torchaudio.models import RNNT
  from torchaudio.prototype.models.rnnt import TrieNode

+ from torchaudio._internal.module_utils import dropping_class_support
+
  __all__ = ["Hypothesis", "RNNTBeamSearchBiasing"]


@@ -80,6 +82,7 @@ def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None:
              break


+ @dropping_class_support
  class RNNTBeamSearchBiasing(torch.nn.Module):
      r"""Beam search decoder for RNN-T model with biasing support.

torchaudio/prototype/pipelines/__init__.py CHANGED
@@ -1,6 +1,15 @@
  from ._vggish import VGGISH, VGGishBundle
- from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle
- from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
+ from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH as _HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle
+ from .rnnt_pipeline import (
+     EMFORMER_RNNT_BASE_MUSTC as _EMFORMER_RNNT_BASE_MUSTC,
+     EMFORMER_RNNT_BASE_TEDLIUM3 as _EMFORMER_RNNT_BASE_TEDLIUM3
+ )
+ from torchaudio._internal.module_utils import dropping_const_support
+
+ EMFORMER_RNNT_BASE_MUSTC = dropping_const_support(_EMFORMER_RNNT_BASE_MUSTC)
+ EMFORMER_RNNT_BASE_TEDLIUM3 = dropping_const_support(_EMFORMER_RNNT_BASE_TEDLIUM3)
+ HIFIGAN_VOCODER_V3_LJSPEECH = dropping_const_support(_HIFIGAN_VOCODER_V3_LJSPEECH)
+

  __all__ = [
      "EMFORMER_RNNT_BASE_MUSTC",
torchaudio/prototype/pipelines/_vggish/__init__.py CHANGED
@@ -1,3 +1,7 @@
- from ._vggish_pipeline import VGGISH, VGGishBundle
+ from ._vggish_pipeline import VGGISH as _VGGISH, VGGishBundle
+ from torchaudio._internal.module_utils import dropping_const_support
+
+
+ VGGISH = dropping_const_support(_VGGISH, "VGGISH")

  __all__ = ["VGGISH", "VGGishBundle"]
torchaudio/prototype/pipelines/_vggish/_vggish_impl.py CHANGED
@@ -18,6 +18,8 @@ import math

  import torch

+ from torchaudio._internal.module_utils import dropping_class_support
+

  _MEL_BREAK_FREQUENCY_HERTZ = 700.0
  _MEL_HIGH_FREQUENCY_Q = 1127.0
@@ -191,6 +193,7 @@ def _waveform_to_examples(data):
      return log_mel_examples.unsqueeze(1)


+ @dropping_class_support
  class VGGish(torch.nn.Module):
      """Implementation of VGGish model :cite:`45611`."""

@@ -215,7 +218,7 @@ class VGGish(torch.nn.Module):

          return self.embedding_network(x)

-
+ @dropping_class_support
  class VGGishInputProcessor:
      """Converts raw waveforms to batches of examples to use as inputs to VGGish."""

torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py CHANGED
@@ -1,8 +1,8 @@
  from dataclasses import dataclass
  from typing import Callable, Dict

- import torch
- import torchaudio
+ from torchaudio._internal.module_utils import dropping_class_support
+

  from ._vggish_impl import _SAMPLE_RATE, VGGish as _VGGish, VGGishInputProcessor as _VGGishInputProcessor

@@ -12,6 +12,7 @@ def _get_state_dict():
      return torch.load(path)


+ @dropping_class_support
  @dataclass
  class VGGishBundle:
      """VGGish :cite:`45611` inference pipeline ported from
torchaudio/prototype/pipelines/hifigan_pipeline.py CHANGED
@@ -9,7 +9,10 @@ from torchaudio._internal import load_state_dict_from_url
  from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
  from torchaudio.transforms import MelSpectrogram

+ from torchaudio._internal.module_utils import dropping_support, dropping_class_support

+
+ @dropping_class_support
  @dataclass
  class HiFiGANVocoderBundle:
      """Data class that bundles associated information to use pretrained
@@ -82,6 +85,7 @@ class HiFiGANVocoderBundle:
          state_dict = load_state_dict_from_url(url, **dl_kwargs)
          return state_dict

+     @dropping_support
      def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
          """Construct the HiFiGAN Generator model, which can be used a vocoder, and load the pretrained weight.

@@ -99,6 +103,7 @@ class HiFiGANVocoderBundle:
          model.eval()
          return model

+     @dropping_support
      def get_mel_transform(self) -> Module:
          """Construct an object which transforms waveforms into mel spectrograms."""
          return _HiFiGANMelSpectrogram(
torchaudio/prototype/transforms/_transforms.py CHANGED
@@ -3,8 +3,9 @@ from typing import Callable, Optional
  import torch
  from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
  from torchaudio.transforms import Spectrogram
+ from torchaudio._internal.module_utils import dropping_support, dropping_class_support

-
+ @dropping_class_support
  class BarkScale(torch.nn.Module):
      r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.

@@ -72,6 +73,7 @@ class BarkScale(torch.nn.Module):
          return bark_specgram


+ @dropping_class_support
  class InverseBarkScale(torch.nn.Module):
      r"""Estimate a STFT in normal frequency domain from bark frequency domain.

@@ -188,6 +190,7 @@ class InverseBarkScale(torch.nn.Module):
          return specgram


+ @dropping_class_support
  class BarkSpectrogram(torch.nn.Module):
      r"""Create BarkSpectrogram for a raw audio signal.

@@ -297,6 +300,7 @@ class BarkSpectrogram(torch.nn.Module):
          return bark_specgram


+ @dropping_class_support
  class ChromaScale(torch.nn.Module):
      r"""Converts spectrogram to chromagram.

@@ -356,6 +360,7 @@ class ChromaScale(torch.nn.Module):
          return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)


+ @dropping_class_support
  class ChromaSpectrogram(torch.nn.Module):
      r"""Generates chromagram for audio signal.

torchaudio/sox_effects/sox_effects.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple

  import torch
  import torchaudio
- from torchaudio._internal.module_utils import deprecated
+ from torchaudio._internal.module_utils import deprecated, dropping_support
  from torchaudio.utils.sox_utils import list_effects


@@ -39,6 +39,7 @@ def shutdown_sox_effects():
      pass


+ @dropping_support
  def effect_names() -> List[str]:
      """Gets list of valid sox effect names

@@ -52,6 +53,7 @@ def effect_names() -> List[str]:
      return list(list_effects().keys())


+ @dropping_support
  def apply_effects_tensor(
      tensor: torch.Tensor,
      sample_rate: int,
@@ -156,6 +158,7 @@ def apply_effects_tensor(
      return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first)


+ @dropping_support
  def apply_effects_file(
      path: str,
      effects: List[List[str]],
torchaudio/transforms/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from torchaudio._internal.module_utils import dropping_class_support
  from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
  from ._transforms import (
      AddNoise,
@@ -21,7 +22,7 @@ from ._transforms import (
      PitchShift,
      Preemphasis,
      Resample,
-     RNNTLoss,
+     RNNTLoss as _RNNTLoss,
      SlidingWindowCmn,
      SpecAugment,
      SpectralCentroid,
@@ -34,6 +35,7 @@ from ._transforms import (
      Vol,
  )

+ RNNTLoss = dropping_class_support(_RNNTLoss)

  __all__ = [
      "AddNoise",
torchaudio/transforms/_transforms.py CHANGED
@@ -10,6 +10,7 @@ from torch.nn.modules.lazy import LazyModuleMixin
  from torch.nn.parameter import UninitializedParameter

  from torchaudio import functional as F
+ from torchaudio.functional.functional import _rnnt_loss
  from torchaudio.functional.functional import (
      _apply_sinc_resample_kernel,
      _check_convolve_mode,
@@ -1846,7 +1847,7 @@ class RNNTLoss(torch.nn.Module):
              Tensor: Loss with the reduction option applied. If ``reduction`` is ``"none"``, then size (batch),
              otherwise scalar.
          """
-         return F.rnnt_loss(
+         return _rnnt_loss(
              logits,
              targets,
              logit_lengths,
@@ -2134,4 +2135,4 @@ class Deemphasis(torch.nn.Module):
          Returns:
              torch.Tensor: De-emphasized waveform, with shape `(..., N)`.
          """
-         return F.deemphasis(waveform, coeff=self.coeff)
+         return F.functional.deemphasis(waveform, coeff=self.coeff)
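
These last two hunks reroute internal calls from the public functional API to the underlying implementations (_rnnt_loss, and F.functional.deemphasis instead of F.deemphasis), presumably so that using transforms.RNNTLoss or transforms.Deemphasis does not also trigger the deprecation warning now attached to the public functions. The pattern in miniature, with illustrative names:

    import warnings

    def _impl(x):  # private, unwrapped implementation
        return 2 * x

    def public(x):  # public API, deprecated wrapper
        warnings.warn("public() is deprecated", UserWarning, stacklevel=2)
        return _impl(x)

    class Module:
        def forward(self, x):
            return _impl(x)  # calling _impl, not public(), avoids a spurious warning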