torchaudio 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +10 -0
- data/ext/torchaudio/extconf.rb +0 -1
- data/lib/torchaudio.rb +4 -0
- data/lib/torchaudio/functional.rb +29 -3
- data/lib/torchaudio/transforms/compute_deltas.rb +15 -0
- data/lib/torchaudio/transforms/fade.rb +74 -0
- data/lib/torchaudio/transforms/mel_spectrogram.rb +2 -0
- data/lib/torchaudio/transforms/mfcc.rb +43 -0
- data/lib/torchaudio/transforms/vol.rb +31 -0
- data/lib/torchaudio/version.rb +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ed4c14921f1eee18f5e08ddabfae51e09a9b5a7ef408f1dd67fdf7bfe9622fe
|
4
|
+
data.tar.gz: 1e37d5b9abed9cab7bf56a8c30a769bc8ff8f8a3e15e78bbb772847c444571b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ca5436d7e4309dd9659fdce7ee893b122e9da96e9f7b15bf00de5dea32c635e2828a99939f04ef5bf0d9494ab89957829a65002dc3e855fa8a66f54abbbd181
|
7
|
+
data.tar.gz: d62b2a137c19d3b24facb11eda5c1b81be5841120b505877b8617bee2b9f183dbe4b4d42a95af27447346a3d48476d7faec48b57cd89b88c0ddc9709f1b5d51b
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -51,10 +51,16 @@ TorchAudio::Transforms::Spectrogram.new.call(waveform)
|
|
51
51
|
|
52
52
|
Supported transforms are:
|
53
53
|
|
54
|
+
- AmplitudeToDB
|
55
|
+
- ComputeDeltas
|
56
|
+
- Fade
|
57
|
+
- MelScale
|
54
58
|
- MelSpectrogram
|
59
|
+
- MFCC
|
55
60
|
- MuLawDecoding
|
56
61
|
- MuLawEncoding
|
57
62
|
- Spectrogram
|
63
|
+
- Vol
|
58
64
|
|
59
65
|
## Functional
|
60
66
|
|
@@ -64,7 +70,11 @@ TorchAudio::Functional.lowpass_biquad(waveform, sample_rate, cutoff_freq)
|
|
64
70
|
|
65
71
|
Supported functions are:
|
66
72
|
|
73
|
+
- amplitude_to_DB
|
67
74
|
- compute_deltas
|
75
|
+
- create_dct
|
76
|
+
- create_fb_matrix
|
77
|
+
- DB_to_amplitude
|
68
78
|
- dither
|
69
79
|
- gain
|
70
80
|
- highpass_biquad
|
data/ext/torchaudio/extconf.rb
CHANGED
@@ -22,7 +22,6 @@ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
|
|
22
22
|
|
23
23
|
# check omp first
|
24
24
|
if have_library("omp") || have_library("gomp")
|
25
|
-
$CXXFLAGS += " -DAT_PARALLEL_OPENMP=1"
|
26
25
|
$CXXFLAGS += " -Xclang" if apple_clang
|
27
26
|
$CXXFLAGS += " -fopenmp"
|
28
27
|
end
|
data/lib/torchaudio.rb
CHANGED
@@ -15,12 +15,16 @@ require "set"
|
|
15
15
|
require "torchaudio/datasets/utils"
|
16
16
|
require "torchaudio/datasets/yesno"
|
17
17
|
require "torchaudio/functional"
|
18
|
+
require "torchaudio/transforms/compute_deltas"
|
19
|
+
require "torchaudio/transforms/fade"
|
18
20
|
require "torchaudio/transforms/mel_scale"
|
19
21
|
require "torchaudio/transforms/mel_spectrogram"
|
20
22
|
require "torchaudio/transforms/mu_law_encoding"
|
21
23
|
require "torchaudio/transforms/mu_law_decoding"
|
22
24
|
require "torchaudio/transforms/spectrogram"
|
23
25
|
require "torchaudio/transforms/amplitude_to_db"
|
26
|
+
require "torchaudio/transforms/mfcc"
|
27
|
+
require "torchaudio/transforms/vol"
|
24
28
|
require "torchaudio/version"
|
25
29
|
|
26
30
|
module TorchAudio
|
data/lib/torchaudio/functional.rb
CHANGED
@@ -12,9 +12,18 @@ module TorchAudio
|
|
12
12
|
waveform = waveform.reshape(-1, shape[-1])
|
13
13
|
|
14
14
|
# default values are consistent with librosa.core.spectrum._spectrogram
|
15
|
-
spec_f =
|
16
|
-
|
17
|
-
|
15
|
+
spec_f =
|
16
|
+
Torch.stft(
|
17
|
+
waveform,
|
18
|
+
n_fft,
|
19
|
+
hop_length: hop_length,
|
20
|
+
win_length: win_length,
|
21
|
+
window: window,
|
22
|
+
center: true,
|
23
|
+
pad_mode: "reflect",
|
24
|
+
normalized: false,
|
25
|
+
onesided: true
|
26
|
+
)
|
18
27
|
|
19
28
|
# unpack batch
|
20
29
|
spec_f = spec_f.reshape(shape[0..-2] + spec_f.shape[-3..-1])
|
@@ -240,6 +249,23 @@ module TorchAudio
|
|
240
249
|
Torch.pow(Torch.pow(10.0, db * 0.1), power) * ref
|
241
250
|
end
|
242
251
|
|
252
|
+
def create_dct(n_mfcc, n_mels, norm: nil)
|
253
|
+
n = Torch.arange(n_mels.to_f)
|
254
|
+
k = Torch.arange(n_mfcc.to_f).unsqueeze!(1)
|
255
|
+
dct = Torch.cos((n + 0.5) * k * Math::PI / n_mels.to_f)
|
256
|
+
|
257
|
+
if norm.nil?
|
258
|
+
dct *= 2.0
|
259
|
+
else
|
260
|
+
raise ArgumentError, "Invalid DCT norm value" unless norm == :ortho
|
261
|
+
|
262
|
+
dct[0] *= 1.0 / Math.sqrt(2.0)
|
263
|
+
dct *= Math.sqrt(2.0 / n_mels)
|
264
|
+
end
|
265
|
+
|
266
|
+
dct.t
|
267
|
+
end
|
268
|
+
|
243
269
|
private
|
244
270
|
|
245
271
|
def _apply_probability_distribution(waveform, density_function: "TPDF")
|
data/lib/torchaudio/transforms/compute_deltas.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class ComputeDeltas < Torch::NN::Module
|
4
|
+
def initialize(win_length: 5, mode: "replicate")
|
5
|
+
super()
|
6
|
+
@win_length = win_length
|
7
|
+
@mode = mode
|
8
|
+
end
|
9
|
+
|
10
|
+
def forward(specgram)
|
11
|
+
F.compute_deltas(specgram, win_length: @win_length, mode: @mode)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/torchaudio/transforms/fade.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class Fade < Torch::NN::Module
|
4
|
+
def initialize(fade_in_len: 0, fade_out_len: 0, fade_shape: "linear")
|
5
|
+
super()
|
6
|
+
@fade_in_len = fade_in_len
|
7
|
+
@fade_out_len = fade_out_len
|
8
|
+
@fade_shape = fade_shape
|
9
|
+
end
|
10
|
+
|
11
|
+
def forward(waveform)
|
12
|
+
waveform_length = waveform.size[-1]
|
13
|
+
device = waveform.device
|
14
|
+
fade_in(waveform_length).to(device) * fade_out(waveform_length).to(device) * waveform
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def fade_in(waveform_length)
|
20
|
+
fade = Torch.linspace(0, 1, @fade_in_len)
|
21
|
+
ones = Torch.ones(waveform_length - @fade_in_len)
|
22
|
+
|
23
|
+
if @fade_shape == "linear"
|
24
|
+
fade = fade
|
25
|
+
end
|
26
|
+
|
27
|
+
if @fade_shape == "exponential"
|
28
|
+
fade = Torch.pow(2, (fade - 1)) * fade
|
29
|
+
end
|
30
|
+
|
31
|
+
if @fade_shape == "logarithmic"
|
32
|
+
fade = Torch.log10(0.1 + fade) + 1
|
33
|
+
end
|
34
|
+
|
35
|
+
if @fade_shape == "quarter_sine"
|
36
|
+
fade = Torch.sin(fade * Math::PI / 2)
|
37
|
+
end
|
38
|
+
|
39
|
+
if @fade_shape == "half_sine"
|
40
|
+
fade = Torch.sin(fade * Math::PI - Math::PI / 2) / 2 + 0.5
|
41
|
+
end
|
42
|
+
|
43
|
+
Torch.cat([fade, ones]).clamp!(0, 1)
|
44
|
+
end
|
45
|
+
|
46
|
+
def fade_out(waveform_length)
|
47
|
+
fade = Torch.linspace(0, 1, @fade_out_len)
|
48
|
+
ones = Torch.ones(waveform_length - @fade_out_len)
|
49
|
+
|
50
|
+
if @fade_shape == "linear"
|
51
|
+
fade = - fade + 1
|
52
|
+
end
|
53
|
+
|
54
|
+
if @fade_shape == "exponential"
|
55
|
+
fade = Torch.pow(2, - fade) * (1 - fade)
|
56
|
+
end
|
57
|
+
|
58
|
+
if @fade_shape == "logarithmic"
|
59
|
+
fade = Torch.log10(1.1 - fade) + 1
|
60
|
+
end
|
61
|
+
|
62
|
+
if @fade_shape == "quarter_sine"
|
63
|
+
fade = Torch.sin(fade * Math::PI / 2 + Math::PI / 2)
|
64
|
+
end
|
65
|
+
|
66
|
+
if @fade_shape == "half_sine"
|
67
|
+
fade = Torch.sin(fade * Math::PI + Math::PI / 2) / 2 + 0.5
|
68
|
+
end
|
69
|
+
|
70
|
+
Torch.cat([ones, fade]).clamp!(0, 1)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
data/lib/torchaudio/transforms/mel_spectrogram.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
module TorchAudio
|
2
2
|
module Transforms
|
3
3
|
class MelSpectrogram < Torch::NN::Module
|
4
|
+
attr_reader :n_mels
|
5
|
+
|
4
6
|
def initialize(
|
5
7
|
sample_rate: 16000, n_fft: 400, win_length: nil, hop_length: nil, f_min: 0.0,
|
6
8
|
f_max: nil, pad: 0, n_mels: 128, window_fn: Torch.method(:hann_window),
|
data/lib/torchaudio/transforms/mfcc.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class MFCC < Torch::NN::Module
|
4
|
+
|
5
|
+
SUPPORTED_DCT_TYPES = [2]
|
6
|
+
|
7
|
+
def initialize(sample_rate: 16000, n_mfcc: 40, dct_type: 2, norm: :ortho, log_mels: false, melkwargs: {})
|
8
|
+
super()
|
9
|
+
|
10
|
+
raise ArgumentError, "DCT type not supported: #{dct_type}" unless SUPPORTED_DCT_TYPES.include?(dct_type)
|
11
|
+
|
12
|
+
@sample_rate = sample_rate
|
13
|
+
@n_mfcc = n_mfcc
|
14
|
+
@dct_type = dct_type
|
15
|
+
@norm = norm
|
16
|
+
@top_db = 80.0
|
17
|
+
@amplitude_to_db = TorchAudio::Transforms::AmplitudeToDB.new(stype: :power, top_db: @top_db)
|
18
|
+
|
19
|
+
@melspectrogram = TorchAudio::Transforms::MelSpectrogram.new(sample_rate: @sample_rate, **melkwargs)
|
20
|
+
|
21
|
+
raise ArgumentError, "Cannot select more MFCC coefficients than # mel bins" if @n_mfcc > @melspectrogram.n_mels
|
22
|
+
|
23
|
+
dct_mat = F.create_dct(@n_mfcc, @melspectrogram.n_mels, norm: @norm)
|
24
|
+
register_buffer('dct_mat', dct_mat)
|
25
|
+
|
26
|
+
@log_mels = log_mels
|
27
|
+
end
|
28
|
+
|
29
|
+
def forward(waveform)
|
30
|
+
mel_specgram = @melspectrogram.(waveform)
|
31
|
+
if @log_mels
|
32
|
+
mel_specgram = Torch.log(mel_specgram + 1e-6)
|
33
|
+
else
|
34
|
+
mel_specgram = @amplitude_to_db.(mel_specgram)
|
35
|
+
end
|
36
|
+
|
37
|
+
Torch
|
38
|
+
.matmul(mel_specgram.transpose(-2, -1), @dct_mat)
|
39
|
+
.transpose(-2, -1)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/lib/torchaudio/transforms/vol.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class Vol < Torch::NN::Module
|
4
|
+
def initialize(gain, gain_type: "amplitude")
|
5
|
+
super()
|
6
|
+
@gain = gain
|
7
|
+
@gain_type = gain_type
|
8
|
+
|
9
|
+
if ["amplitude", "power"].include?(gain_type) && gain < 0
|
10
|
+
raise ArgumentError, "If gain_type = amplitude or power, gain must be positive."
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def forward(waveform)
|
15
|
+
if @gain_type == "amplitude"
|
16
|
+
waveform = waveform * @gain
|
17
|
+
end
|
18
|
+
|
19
|
+
if @gain_type == "db"
|
20
|
+
waveform = F.gain(waveform, @gain)
|
21
|
+
end
|
22
|
+
|
23
|
+
if @gain_type == "power"
|
24
|
+
waveform = F.gain(waveform, 10 * Math.log10(@gain))
|
25
|
+
end
|
26
|
+
|
27
|
+
Torch.clamp(waveform, -1, 1)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/torchaudio/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: torchaudio
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: torch-rb
|
@@ -64,11 +64,15 @@ files:
|
|
64
64
|
- lib/torchaudio/datasets/yesno.rb
|
65
65
|
- lib/torchaudio/functional.rb
|
66
66
|
- lib/torchaudio/transforms/amplitude_to_db.rb
|
67
|
+
- lib/torchaudio/transforms/compute_deltas.rb
|
68
|
+
- lib/torchaudio/transforms/fade.rb
|
67
69
|
- lib/torchaudio/transforms/mel_scale.rb
|
68
70
|
- lib/torchaudio/transforms/mel_spectrogram.rb
|
71
|
+
- lib/torchaudio/transforms/mfcc.rb
|
69
72
|
- lib/torchaudio/transforms/mu_law_decoding.rb
|
70
73
|
- lib/torchaudio/transforms/mu_law_encoding.rb
|
71
74
|
- lib/torchaudio/transforms/spectrogram.rb
|
75
|
+
- lib/torchaudio/transforms/vol.rb
|
72
76
|
- lib/torchaudio/version.rb
|
73
77
|
homepage: https://github.com/ankane/torchaudio
|
74
78
|
licenses:
|
@@ -89,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
89
93
|
- !ruby/object:Gem::Version
|
90
94
|
version: '0'
|
91
95
|
requirements: []
|
92
|
-
rubygems_version: 3.2.
|
96
|
+
rubygems_version: 3.2.22
|
93
97
|
signing_key:
|
94
98
|
specification_version: 4
|
95
99
|
summary: Data manipulation and transformation for audio signal processing
|