torchaudio 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +10 -0
- data/ext/torchaudio/extconf.rb +0 -1
- data/lib/torchaudio.rb +4 -0
- data/lib/torchaudio/functional.rb +29 -3
- data/lib/torchaudio/transforms/compute_deltas.rb +15 -0
- data/lib/torchaudio/transforms/fade.rb +74 -0
- data/lib/torchaudio/transforms/mel_spectrogram.rb +2 -0
- data/lib/torchaudio/transforms/mfcc.rb +43 -0
- data/lib/torchaudio/transforms/vol.rb +31 -0
- data/lib/torchaudio/version.rb +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ed4c14921f1eee18f5e08ddabfae51e09a9b5a7ef408f1dd67fdf7bfe9622fe
|
4
|
+
data.tar.gz: 1e37d5b9abed9cab7bf56a8c30a769bc8ff8f8a3e15e78bbb772847c444571b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ca5436d7e4309dd9659fdce7ee893b122e9da96e9f7b15bf00de5dea32c635e2828a99939f04ef5bf0d9494ab89957829a65002dc3e855fa8a66f54abbbd181
|
7
|
+
data.tar.gz: d62b2a137c19d3b24facb11eda5c1b81be5841120b505877b8617bee2b9f183dbe4b4d42a95af27447346a3d48476d7faec48b57cd89b88c0ddc9709f1b5d51b
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -51,10 +51,16 @@ TorchAudio::Transforms::Spectrogram.new.call(waveform)
|
|
51
51
|
|
52
52
|
Supported transforms are:
|
53
53
|
|
54
|
+
- AmplitudeToDB
|
55
|
+
- ComputeDeltas
|
56
|
+
- Fade
|
57
|
+
- MelScale
|
54
58
|
- MelSpectrogram
|
59
|
+
- MFCC
|
55
60
|
- MuLawDecoding
|
56
61
|
- MuLawEncoding
|
57
62
|
- Spectrogram
|
63
|
+
- Vol
|
58
64
|
|
59
65
|
## Functional
|
60
66
|
|
@@ -64,7 +70,11 @@ TorchAudio::Functional.lowpass_biquad(waveform, sample_rate, cutoff_freq)
|
|
64
70
|
|
65
71
|
Supported functions are:
|
66
72
|
|
73
|
+
- amplitude_to_DB
|
67
74
|
- compute_deltas
|
75
|
+
- create_dct
|
76
|
+
- create_fb_matrix
|
77
|
+
- DB_to_amplitude
|
68
78
|
- dither
|
69
79
|
- gain
|
70
80
|
- highpass_biquad
|
data/ext/torchaudio/extconf.rb
CHANGED
@@ -22,7 +22,6 @@ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
|
|
22
22
|
|
23
23
|
# check omp first
|
24
24
|
if have_library("omp") || have_library("gomp")
|
25
|
-
$CXXFLAGS += " -DAT_PARALLEL_OPENMP=1"
|
26
25
|
$CXXFLAGS += " -Xclang" if apple_clang
|
27
26
|
$CXXFLAGS += " -fopenmp"
|
28
27
|
end
|
data/lib/torchaudio.rb
CHANGED
@@ -15,12 +15,16 @@ require "set"
|
|
15
15
|
require "torchaudio/datasets/utils"
|
16
16
|
require "torchaudio/datasets/yesno"
|
17
17
|
require "torchaudio/functional"
|
18
|
+
require "torchaudio/transforms/compute_deltas"
|
19
|
+
require "torchaudio/transforms/fade"
|
18
20
|
require "torchaudio/transforms/mel_scale"
|
19
21
|
require "torchaudio/transforms/mel_spectrogram"
|
20
22
|
require "torchaudio/transforms/mu_law_encoding"
|
21
23
|
require "torchaudio/transforms/mu_law_decoding"
|
22
24
|
require "torchaudio/transforms/spectrogram"
|
23
25
|
require "torchaudio/transforms/amplitude_to_db"
|
26
|
+
require "torchaudio/transforms/mfcc"
|
27
|
+
require "torchaudio/transforms/vol"
|
24
28
|
require "torchaudio/version"
|
25
29
|
|
26
30
|
module TorchAudio
|
@@ -12,9 +12,18 @@ module TorchAudio
|
|
12
12
|
waveform = waveform.reshape(-1, shape[-1])
|
13
13
|
|
14
14
|
# default values are consistent with librosa.core.spectrum._spectrogram
|
15
|
-
spec_f =
|
16
|
-
|
17
|
-
|
15
|
+
spec_f =
|
16
|
+
Torch.stft(
|
17
|
+
waveform,
|
18
|
+
n_fft,
|
19
|
+
hop_length: hop_length,
|
20
|
+
win_length: win_length,
|
21
|
+
window: window,
|
22
|
+
center: true,
|
23
|
+
pad_mode: "reflect",
|
24
|
+
normalized: false,
|
25
|
+
onesided: true
|
26
|
+
)
|
18
27
|
|
19
28
|
# unpack batch
|
20
29
|
spec_f = spec_f.reshape(shape[0..-2] + spec_f.shape[-3..-1])
|
@@ -240,6 +249,23 @@ module TorchAudio
|
|
240
249
|
Torch.pow(Torch.pow(10.0, db * 0.1), power) * ref
|
241
250
|
end
|
242
251
|
|
252
|
+
def create_dct(n_mfcc, n_mels, norm: nil)
|
253
|
+
n = Torch.arange(n_mels.to_f)
|
254
|
+
k = Torch.arange(n_mfcc.to_f).unsqueeze!(1)
|
255
|
+
dct = Torch.cos((n + 0.5) * k * Math::PI / n_mels.to_f)
|
256
|
+
|
257
|
+
if norm.nil?
|
258
|
+
dct *= 2.0
|
259
|
+
else
|
260
|
+
raise ArgumentError, "Invalid DCT norm value" unless norm == :ortho
|
261
|
+
|
262
|
+
dct[0] *= 1.0 / Math.sqrt(2.0)
|
263
|
+
dct *= Math.sqrt(2.0 / n_mels)
|
264
|
+
end
|
265
|
+
|
266
|
+
dct.t
|
267
|
+
end
|
268
|
+
|
243
269
|
private
|
244
270
|
|
245
271
|
def _apply_probability_distribution(waveform, density_function: "TPDF")
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class ComputeDeltas < Torch::NN::Module
|
4
|
+
def initialize(win_length: 5, mode: "replicate")
|
5
|
+
super()
|
6
|
+
@win_length = win_length
|
7
|
+
@mode = mode
|
8
|
+
end
|
9
|
+
|
10
|
+
def forward(specgram)
|
11
|
+
F.compute_deltas(specgram, win_length: @win_length, mode: @mode)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class Fade < Torch::NN::Module
|
4
|
+
def initialize(fade_in_len: 0, fade_out_len: 0, fade_shape: "linear")
|
5
|
+
super()
|
6
|
+
@fade_in_len = fade_in_len
|
7
|
+
@fade_out_len = fade_out_len
|
8
|
+
@fade_shape = fade_shape
|
9
|
+
end
|
10
|
+
|
11
|
+
def forward(waveform)
|
12
|
+
waveform_length = waveform.size[-1]
|
13
|
+
device = waveform.device
|
14
|
+
fade_in(waveform_length).to(device) * fade_out(waveform_length).to(device) * waveform
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def fade_in(waveform_length)
|
20
|
+
fade = Torch.linspace(0, 1, @fade_in_len)
|
21
|
+
ones = Torch.ones(waveform_length - @fade_in_len)
|
22
|
+
|
23
|
+
if @fade_shape == "linear"
|
24
|
+
fade = fade
|
25
|
+
end
|
26
|
+
|
27
|
+
if @fade_shape == "exponential"
|
28
|
+
fade = Torch.pow(2, (fade - 1)) * fade
|
29
|
+
end
|
30
|
+
|
31
|
+
if @fade_shape == "logarithmic"
|
32
|
+
fade = Torch.log10(0.1 + fade) + 1
|
33
|
+
end
|
34
|
+
|
35
|
+
if @fade_shape == "quarter_sine"
|
36
|
+
fade = Torch.sin(fade * Math::PI / 2)
|
37
|
+
end
|
38
|
+
|
39
|
+
if @fade_shape == "half_sine"
|
40
|
+
fade = Torch.sin(fade * Math::PI - Math::PI / 2) / 2 + 0.5
|
41
|
+
end
|
42
|
+
|
43
|
+
Torch.cat([fade, ones]).clamp!(0, 1)
|
44
|
+
end
|
45
|
+
|
46
|
+
def fade_out(waveform_length)
|
47
|
+
fade = Torch.linspace(0, 1, @fade_out_len)
|
48
|
+
ones = Torch.ones(waveform_length - @fade_out_len)
|
49
|
+
|
50
|
+
if @fade_shape == "linear"
|
51
|
+
fade = - fade + 1
|
52
|
+
end
|
53
|
+
|
54
|
+
if @fade_shape == "exponential"
|
55
|
+
fade = Torch.pow(2, - fade) * (1 - fade)
|
56
|
+
end
|
57
|
+
|
58
|
+
if @fade_shape == "logarithmic"
|
59
|
+
fade = Torch.log10(1.1 - fade) + 1
|
60
|
+
end
|
61
|
+
|
62
|
+
if @fade_shape == "quarter_sine"
|
63
|
+
fade = Torch.sin(fade * Math::PI / 2 + Math::PI / 2)
|
64
|
+
end
|
65
|
+
|
66
|
+
if @fade_shape == "half_sine"
|
67
|
+
fade = Torch.sin(fade * Math::PI + Math::PI / 2) / 2 + 0.5
|
68
|
+
end
|
69
|
+
|
70
|
+
Torch.cat([ones, fade]).clamp!(0, 1)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module TorchAudio
|
2
2
|
module Transforms
|
3
3
|
class MelSpectrogram < Torch::NN::Module
|
4
|
+
attr_reader :n_mels
|
5
|
+
|
4
6
|
def initialize(
|
5
7
|
sample_rate: 16000, n_fft: 400, win_length: nil, hop_length: nil, f_min: 0.0,
|
6
8
|
f_max: nil, pad: 0, n_mels: 128, window_fn: Torch.method(:hann_window),
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class MFCC < Torch::NN::Module
|
4
|
+
|
5
|
+
SUPPORTED_DCT_TYPES = [2]
|
6
|
+
|
7
|
+
def initialize(sample_rate: 16000, n_mfcc: 40, dct_type: 2, norm: :ortho, log_mels: false, melkwargs: {})
|
8
|
+
super()
|
9
|
+
|
10
|
+
raise ArgumentError, "DCT type not supported: #{dct_type}" unless SUPPORTED_DCT_TYPES.include?(dct_type)
|
11
|
+
|
12
|
+
@sample_rate = sample_rate
|
13
|
+
@n_mfcc = n_mfcc
|
14
|
+
@dct_type = dct_type
|
15
|
+
@norm = norm
|
16
|
+
@top_db = 80.0
|
17
|
+
@amplitude_to_db = TorchAudio::Transforms::AmplitudeToDB.new(stype: :power, top_db: @top_db)
|
18
|
+
|
19
|
+
@melspectrogram = TorchAudio::Transforms::MelSpectrogram.new(sample_rate: @sample_rate, **melkwargs)
|
20
|
+
|
21
|
+
raise ArgumentError, "Cannot select more MFCC coefficients than # mel bins" if @n_mfcc > @melspectrogram.n_mels
|
22
|
+
|
23
|
+
dct_mat = F.create_dct(@n_mfcc, @melspectrogram.n_mels, norm: @norm)
|
24
|
+
register_buffer('dct_mat', dct_mat)
|
25
|
+
|
26
|
+
@log_mels = log_mels
|
27
|
+
end
|
28
|
+
|
29
|
+
def forward(waveform)
|
30
|
+
mel_specgram = @melspectrogram.(waveform)
|
31
|
+
if @log_mels
|
32
|
+
mel_specgram = Torch.log(mel_specgram + 1e-6)
|
33
|
+
else
|
34
|
+
mel_specgram = @amplitude_to_db.(mel_specgram)
|
35
|
+
end
|
36
|
+
|
37
|
+
Torch
|
38
|
+
.matmul(mel_specgram.transpose(-2, -1), @dct_mat)
|
39
|
+
.transpose(-2, -1)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module TorchAudio
|
2
|
+
module Transforms
|
3
|
+
class Vol < Torch::NN::Module
|
4
|
+
def initialize(gain, gain_type: "amplitude")
|
5
|
+
super()
|
6
|
+
@gain = gain
|
7
|
+
@gain_type = gain_type
|
8
|
+
|
9
|
+
if ["amplitude", "power"].include?(gain_type) && gain < 0
|
10
|
+
raise ArgumentError, "If gain_type = amplitude or power, gain must be positive."
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def forward(waveform)
|
15
|
+
if @gain_type == "amplitude"
|
16
|
+
waveform = waveform * @gain
|
17
|
+
end
|
18
|
+
|
19
|
+
if @gain_type == "db"
|
20
|
+
waveform = F.gain(waveform, @gain)
|
21
|
+
end
|
22
|
+
|
23
|
+
if @gain_type == "power"
|
24
|
+
waveform = F.gain(waveform, 10 * Math.log10(@gain))
|
25
|
+
end
|
26
|
+
|
27
|
+
Torch.clamp(waveform, -1, 1)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/torchaudio/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: torchaudio
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: torch-rb
|
@@ -64,11 +64,15 @@ files:
|
|
64
64
|
- lib/torchaudio/datasets/yesno.rb
|
65
65
|
- lib/torchaudio/functional.rb
|
66
66
|
- lib/torchaudio/transforms/amplitude_to_db.rb
|
67
|
+
- lib/torchaudio/transforms/compute_deltas.rb
|
68
|
+
- lib/torchaudio/transforms/fade.rb
|
67
69
|
- lib/torchaudio/transforms/mel_scale.rb
|
68
70
|
- lib/torchaudio/transforms/mel_spectrogram.rb
|
71
|
+
- lib/torchaudio/transforms/mfcc.rb
|
69
72
|
- lib/torchaudio/transforms/mu_law_decoding.rb
|
70
73
|
- lib/torchaudio/transforms/mu_law_encoding.rb
|
71
74
|
- lib/torchaudio/transforms/spectrogram.rb
|
75
|
+
- lib/torchaudio/transforms/vol.rb
|
72
76
|
- lib/torchaudio/version.rb
|
73
77
|
homepage: https://github.com/ankane/torchaudio
|
74
78
|
licenses:
|
@@ -89,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
89
93
|
- !ruby/object:Gem::Version
|
90
94
|
version: '0'
|
91
95
|
requirements: []
|
92
|
-
rubygems_version: 3.2.
|
96
|
+
rubygems_version: 3.2.22
|
93
97
|
signing_key:
|
94
98
|
specification_version: 4
|
95
99
|
summary: Data manipulation and transformation for audio signal processing
|