torchaudio 2.7.0__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (148) hide show
  1. torchaudio/__init__.py +53 -0
  2. torchaudio/_backend/__init__.py +61 -0
  3. torchaudio/_backend/backend.py +53 -0
  4. torchaudio/_backend/common.py +52 -0
  5. torchaudio/_backend/ffmpeg.py +334 -0
  6. torchaudio/_backend/soundfile.py +54 -0
  7. torchaudio/_backend/soundfile_backend.py +457 -0
  8. torchaudio/_backend/sox.py +91 -0
  9. torchaudio/_backend/utils.py +317 -0
  10. torchaudio/_extension/__init__.py +74 -0
  11. torchaudio/_extension/utils.py +180 -0
  12. torchaudio/_internal/__init__.py +10 -0
  13. torchaudio/_internal/module_utils.py +113 -0
  14. torchaudio/backend/__init__.py +8 -0
  15. torchaudio/backend/_no_backend.py +25 -0
  16. torchaudio/backend/_sox_io_backend.py +294 -0
  17. torchaudio/backend/common.py +13 -0
  18. torchaudio/backend/no_backend.py +14 -0
  19. torchaudio/backend/soundfile_backend.py +14 -0
  20. torchaudio/backend/sox_io_backend.py +14 -0
  21. torchaudio/compliance/__init__.py +5 -0
  22. torchaudio/compliance/kaldi.py +813 -0
  23. torchaudio/datasets/__init__.py +47 -0
  24. torchaudio/datasets/cmuarctic.py +157 -0
  25. torchaudio/datasets/cmudict.py +186 -0
  26. torchaudio/datasets/commonvoice.py +86 -0
  27. torchaudio/datasets/dr_vctk.py +121 -0
  28. torchaudio/datasets/fluentcommands.py +108 -0
  29. torchaudio/datasets/gtzan.py +1118 -0
  30. torchaudio/datasets/iemocap.py +147 -0
  31. torchaudio/datasets/librilight_limited.py +111 -0
  32. torchaudio/datasets/librimix.py +133 -0
  33. torchaudio/datasets/librispeech.py +174 -0
  34. torchaudio/datasets/librispeech_biasing.py +189 -0
  35. torchaudio/datasets/libritts.py +168 -0
  36. torchaudio/datasets/ljspeech.py +107 -0
  37. torchaudio/datasets/musdb_hq.py +139 -0
  38. torchaudio/datasets/quesst14.py +136 -0
  39. torchaudio/datasets/snips.py +157 -0
  40. torchaudio/datasets/speechcommands.py +183 -0
  41. torchaudio/datasets/tedlium.py +218 -0
  42. torchaudio/datasets/utils.py +54 -0
  43. torchaudio/datasets/vctk.py +143 -0
  44. torchaudio/datasets/voxceleb1.py +309 -0
  45. torchaudio/datasets/yesno.py +89 -0
  46. torchaudio/functional/__init__.py +127 -0
  47. torchaudio/functional/_alignment.py +128 -0
  48. torchaudio/functional/filtering.py +1670 -0
  49. torchaudio/functional/functional.py +2535 -0
  50. torchaudio/io/__init__.py +13 -0
  51. torchaudio/io/_effector.py +347 -0
  52. torchaudio/io/_playback.py +72 -0
  53. torchaudio/kaldi_io.py +144 -0
  54. torchaudio/lib/__init__.py +0 -0
  55. torchaudio/lib/_torchaudio.so +0 -0
  56. torchaudio/lib/_torchaudio_sox.so +0 -0
  57. torchaudio/lib/libctc_prefix_decoder.so +0 -0
  58. torchaudio/lib/libtorchaudio.so +0 -0
  59. torchaudio/lib/libtorchaudio_sox.so +0 -0
  60. torchaudio/lib/pybind11_prefixctc.so +0 -0
  61. torchaudio/models/__init__.py +85 -0
  62. torchaudio/models/_hdemucs.py +1008 -0
  63. torchaudio/models/conformer.py +293 -0
  64. torchaudio/models/conv_tasnet.py +330 -0
  65. torchaudio/models/decoder/__init__.py +46 -0
  66. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  67. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  68. torchaudio/models/deepspeech.py +84 -0
  69. torchaudio/models/emformer.py +884 -0
  70. torchaudio/models/rnnt.py +816 -0
  71. torchaudio/models/rnnt_decoder.py +339 -0
  72. torchaudio/models/squim/__init__.py +11 -0
  73. torchaudio/models/squim/objective.py +326 -0
  74. torchaudio/models/squim/subjective.py +150 -0
  75. torchaudio/models/tacotron2.py +1046 -0
  76. torchaudio/models/wav2letter.py +72 -0
  77. torchaudio/models/wav2vec2/__init__.py +45 -0
  78. torchaudio/models/wav2vec2/components.py +1167 -0
  79. torchaudio/models/wav2vec2/model.py +1579 -0
  80. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  81. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  82. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  83. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  84. torchaudio/models/wavernn.py +409 -0
  85. torchaudio/pipelines/__init__.py +102 -0
  86. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  87. torchaudio/pipelines/_squim_pipeline.py +156 -0
  88. torchaudio/pipelines/_tts/__init__.py +16 -0
  89. torchaudio/pipelines/_tts/impl.py +385 -0
  90. torchaudio/pipelines/_tts/interface.py +255 -0
  91. torchaudio/pipelines/_tts/utils.py +228 -0
  92. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  93. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  94. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  95. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  96. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  97. torchaudio/prototype/__init__.py +0 -0
  98. torchaudio/prototype/datasets/__init__.py +4 -0
  99. torchaudio/prototype/datasets/musan.py +67 -0
  100. torchaudio/prototype/functional/__init__.py +26 -0
  101. torchaudio/prototype/functional/_dsp.py +433 -0
  102. torchaudio/prototype/functional/_rir.py +379 -0
  103. torchaudio/prototype/functional/functional.py +190 -0
  104. torchaudio/prototype/models/__init__.py +36 -0
  105. torchaudio/prototype/models/_conformer_wav2vec2.py +794 -0
  106. torchaudio/prototype/models/_emformer_hubert.py +333 -0
  107. torchaudio/prototype/models/conv_emformer.py +525 -0
  108. torchaudio/prototype/models/hifi_gan.py +336 -0
  109. torchaudio/prototype/models/rnnt.py +711 -0
  110. torchaudio/prototype/models/rnnt_decoder.py +399 -0
  111. torchaudio/prototype/pipelines/__init__.py +12 -0
  112. torchaudio/prototype/pipelines/_vggish/__init__.py +3 -0
  113. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +233 -0
  114. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +82 -0
  115. torchaudio/prototype/pipelines/hifigan_pipeline.py +228 -0
  116. torchaudio/prototype/pipelines/rnnt_pipeline.py +58 -0
  117. torchaudio/prototype/transforms/__init__.py +9 -0
  118. torchaudio/prototype/transforms/_transforms.py +456 -0
  119. torchaudio/sox_effects/__init__.py +10 -0
  120. torchaudio/sox_effects/sox_effects.py +272 -0
  121. torchaudio/transforms/__init__.py +75 -0
  122. torchaudio/transforms/_multi_channel.py +467 -0
  123. torchaudio/transforms/_transforms.py +2137 -0
  124. torchaudio/utils/__init__.py +11 -0
  125. torchaudio/utils/download.py +89 -0
  126. torchaudio/utils/ffmpeg_utils.py +11 -0
  127. torchaudio/utils/sox_utils.py +99 -0
  128. torchaudio/version.py +2 -0
  129. torchaudio-2.7.0.dist-info/LICENSE +25 -0
  130. torchaudio-2.7.0.dist-info/METADATA +124 -0
  131. torchaudio-2.7.0.dist-info/RECORD +148 -0
  132. torchaudio-2.7.0.dist-info/WHEEL +5 -0
  133. torchaudio-2.7.0.dist-info/top_level.txt +2 -0
  134. torio/__init__.py +8 -0
  135. torio/_extension/__init__.py +13 -0
  136. torio/_extension/utils.py +147 -0
  137. torio/io/__init__.py +9 -0
  138. torio/io/_streaming_media_decoder.py +978 -0
  139. torio/io/_streaming_media_encoder.py +502 -0
  140. torio/lib/__init__.py +0 -0
  141. torio/lib/_torio_ffmpeg4.so +0 -0
  142. torio/lib/_torio_ffmpeg5.so +0 -0
  143. torio/lib/_torio_ffmpeg6.so +0 -0
  144. torio/lib/libtorio_ffmpeg4.so +0 -0
  145. torio/lib/libtorio_ffmpeg5.so +0 -0
  146. torio/lib/libtorio_ffmpeg6.so +0 -0
  147. torio/utils/__init__.py +4 -0
  148. torio/utils/ffmpeg_utils.py +247 -0
@@ -0,0 +1,272 @@
1
+ import os
2
+ from typing import List, Optional, Tuple
3
+
4
+ import torch
5
+ import torchaudio
6
+ from torchaudio._internal.module_utils import deprecated
7
+ from torchaudio.utils.sox_utils import list_effects
8
+
9
+
10
# Handle returned by ``lazy_import_sox_ext``; the ``apply_effects_*`` functions
# in this module forward their work to it.
sox_ext = torchaudio._extension.lazy_import_sox_ext()
11
+
12
+
13
@deprecated("Please remove the call. This function is called automatically.")
def init_sox_effects():
    """Deprecated entry point for initializing the resources used by sox effects.

    Note:
        There is no need to call this function manually; the deprecation
        message asks callers to remove the call.

        The body of this function is empty, so invoking it performs no
        initialization work by itself. Earlier documentation stated that it is
        safe to call repeatedly until :func:`shutdown_sox_effects` is called,
        and that SoX effects can not be initialized again after that point.
    """
    return None
26
+
27
+
28
@deprecated("Please remove the call. This function is called automatically.")
def shutdown_sox_effects():
    """Deprecated entry point for releasing the resources used by sox effects.

    Note:
        There is no need to call this function manually; the deprecation
        message asks callers to remove the call.

        The body of this function is empty, so it can be called any number of
        times. Earlier documentation stated that once
        :py:func:`shutdown_sox_effects` is called, SoX effects can no longer be
        used and initializing again results in an error.
    """
    return None
40
+
41
+
42
def effect_names() -> List[str]:
    """Return the names of the sox effects reported by ``list_effects``.

    Returns:
        List[str]: The keys of the mapping returned by ``list_effects``,
        in the order the mapping yields them.

    Example
        >>> torchaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    return [name for name in list_effects().keys()]
53
+
54
+
55
def apply_effects_tensor(
    tensor: torch.Tensor,
    sample_rate: int,
    effects: List[List[str]],
    channels_first: bool = True,
) -> Tuple[torch.Tensor, int]:
    """Apply sox effects to given Tensor

    .. devices:: CPU

    .. properties:: TorchScript

    Note:
        This function only works on CPU Tensors.
        This function works in a way very similar to the ``sox`` command, however there are
        slight differences. For example, the ``sox`` command adds certain effects automatically
        (such as ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this
        function only applies the given effects. (Therefore, to actually apply ``speed`` effect,
        you also need to give ``rate`` effect with desired sampling rate.)

    Args:
        tensor (torch.Tensor): Input 2D CPU Tensor.
        sample_rate (int): Sample rate
        effects (List[List[str]]): List of effects.
        channels_first (bool, optional): Indicates if the input Tensor's dimension is
            `[channels, time]` or `[time, channels]`

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        The resulting Tensor has the same ``dtype`` as the input Tensor, and
        the same channels order. The shape of the Tensor can be different based on the
        effects applied. Sample rate can also be different based on the effects applied.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Generate pseudo wave:
        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1
        >>> waveform.shape
        torch.Size([2, 16000])
        >>> waveform
        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> # The new waveform is sampling rate 8000, 1 second.
        >>> # normalization and channel order are preserved
        >>> waveform.shape
        torch.Size([2, 8000])
        >>> waveform
        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
        >>> sample_rate
        8000

    Example - Torchscript-able transform
        >>>
        >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file,
        >>> # then run sox effect via Torchscript runtime.
        >>>
        >>> class SoxEffectTransform(torch.nn.Module):
        ...     effects: List[List[str]]
        ...
        ...     def __init__(self, effects: List[List[str]]):
        ...         super().__init__()
        ...         self.effects = effects
        ...
        ...     def forward(self, tensor: torch.Tensor, sample_rate: int):
        ...         return torchaudio.sox_effects.apply_effects_tensor(
        ...             tensor, sample_rate, self.effects)
        ...
        ...
        >>> # Create transform object
        >>> effects = [
        ...     ["lowpass", "-1", "300"],  # apply single-pole lowpass filter
        ...     ["rate", "8000"],  # change sample rate to 8000
        ... ]
        >>> transform = SoxEffectTransform(effects)
        >>>
        >>> # Dump it to file and load
        >>> path = 'sox_effect.zip'
        >>> torch.jit.script(transform).save(path)
        >>> transform = torch.jit.load(path)
        >>>
        >>> # Run transform
        >>> waveform, input_sample_rate = torchaudio.load("input.wav")
        >>> waveform, sample_rate = transform(waveform, input_sample_rate)
        >>> assert sample_rate == 8000
    """
    # All of the work is forwarded to the module-level ``sox_ext`` handle.
    return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first)
157
+
158
+
159
def apply_effects_file(
    path: str,
    effects: List[List[str]],
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Apply sox effects to the audio file and load the resulting data as Tensor

    .. devices:: CPU

    .. properties:: TorchScript

    Note:
        This function works in a way very similar to the ``sox`` command, however there are
        slight differences. For example, the ``sox`` command adds certain effects automatically
        (such as ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies
        the given effects. Therefore, to actually apply ``speed`` effect, you also need to give
        ``rate`` effect with desired sampling rate, because internally, ``speed`` effects only
        alter sampling rate and leave samples untouched.

    Args:
        path (path-like object):
            Source of audio data.
        effects (List[List[str]]): List of effects.
        normalize (bool, optional):
            When ``True``, this function converts the native sample type to ``float32``.
            Default: ``True``.

            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type.
            This argument has no effect for formats other than integer WAV type.

        channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Override the format detection with the given format.
            Providing the argument might help when libsox can not infer the format
            from header or extension.

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
        If ``normalize=False`` and the input audio file is of integer WAV file, then the
        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
        otherwise `[time, channel]`.

    Raises:
        RuntimeError: When not running under TorchScript and ``path`` has a ``read``
            attribute, i.e. it looks like a file-like object.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> waveform.shape
        torch.Size([2, 8000])
        >>> waveform
        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
                 -1.4761e-07,  1.8114e-07],
                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
                 -5.6159e-07,  4.8103e-07]])
        >>> sample_rate
        8000

    Example - Apply random speed perturbation to dataset
        >>>
        >>> # Load data from file, apply random speed perturbation
        >>> class RandomPerturbationFile(torch.utils.data.Dataset):
        ...     \"\"\"Given flist, apply random speed perturbation
        ...
        ...     Suppose all the input files are at least one second long.
        ...     \"\"\"
        ...     def __init__(self, flist: List[str], sample_rate: int):
        ...         super().__init__()
        ...         self.flist = flist
        ...         self.sample_rate = sample_rate
        ...
        ...     def __getitem__(self, index):
        ...         speed = 0.5 + 1.5 * random.random()
        ...         effects = [
        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...             ['remix', '-'],  # merge all the channels
        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...             ['rate', f'{self.sample_rate}'],
        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...             ['trim', '0', '2'],  # get the first 2 seconds
        ...         ]
        ...         waveform, _ = torchaudio.sox_effects.apply_effects_file(
        ...             self.flist[index], effects)
        ...         return waveform
        ...
        ...     def __len__(self):
        ...         return len(self.flist)
        ...
        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
        >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32)
        >>> for batch in loader:
        ...     pass
    """
    # The Python-side argument handling below is skipped while scripting.
    if not torch.jit.is_scripting():
        # Anything exposing ``read`` is treated as a file-like object and rejected.
        if hasattr(path, "read"):
            raise RuntimeError(
                "apply_effects_file function does not support file-like object. "
                "Please use torchaudio.io.AudioEffector."
            )
        # Convert path-like objects to their file-system path representation.
        path = os.fspath(path)
    # All of the work is forwarded to the module-level ``sox_ext`` handle.
    return sox_ext.apply_effects_file(path, effects, normalize, channels_first, format)
@@ -0,0 +1,75 @@
1
+ from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
2
+ from ._transforms import (
3
+ AddNoise,
4
+ AmplitudeToDB,
5
+ ComputeDeltas,
6
+ Convolve,
7
+ Deemphasis,
8
+ Fade,
9
+ FFTConvolve,
10
+ FrequencyMasking,
11
+ GriffinLim,
12
+ InverseMelScale,
13
+ InverseSpectrogram,
14
+ LFCC,
15
+ Loudness,
16
+ MelScale,
17
+ MelSpectrogram,
18
+ MFCC,
19
+ MuLawDecoding,
20
+ MuLawEncoding,
21
+ PitchShift,
22
+ Preemphasis,
23
+ Resample,
24
+ RNNTLoss,
25
+ SlidingWindowCmn,
26
+ SpecAugment,
27
+ SpectralCentroid,
28
+ Spectrogram,
29
+ Speed,
30
+ SpeedPerturbation,
31
+ TimeMasking,
32
+ TimeStretch,
33
+ Vad,
34
+ Vol,
35
+ )
36
+
37
+
38
# Public names of this package: each entry corresponds to a class imported
# above from ``._multi_channel`` or ``._transforms``.
__all__ = [
    "AddNoise",
    "AmplitudeToDB",
    "ComputeDeltas",
    "Convolve",
    "Deemphasis",
    "Fade",
    "FFTConvolve",
    "FrequencyMasking",
    "GriffinLim",
    "InverseMelScale",
    "InverseSpectrogram",
    "LFCC",
    "Loudness",
    "MFCC",
    "MVDR",
    "MelScale",
    "MelSpectrogram",
    "MuLawDecoding",
    "MuLawEncoding",
    "PSD",
    "PitchShift",
    "Preemphasis",
    "RNNTLoss",
    "RTFMVDR",
    "Resample",
    "SlidingWindowCmn",
    "SoudenMVDR",
    "SpecAugment",
    "SpectralCentroid",
    "Spectrogram",
    "Speed",
    "SpeedPerturbation",
    "TimeMasking",
    "TimeStretch",
    "Vad",
    "Vol",
]