torchaudio 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9488128781f307b653965c253dcacbe246ebaf27bf4e6359b030d9a93bafc1b2
4
- data.tar.gz: 5dc18368bdef8945ecaeb1ad2d60771df98bfdd8c356ad2fcdd2b343b8c02b51
3
+ metadata.gz: 9ed4c14921f1eee18f5e08ddabfae51e09a9b5a7ef408f1dd67fdf7bfe9622fe
4
+ data.tar.gz: 1e37d5b9abed9cab7bf56a8c30a769bc8ff8f8a3e15e78bbb772847c444571b2
5
5
  SHA512:
6
- metadata.gz: ddabdfa32632e9d2af024a0f7b67f2add1b694a9a7ef9036f2a1c2f9338106069e6ec1afaea50a0c6f68a8aaa695e57c492d981fb7b3d704fd28ef420c3e2519
7
- data.tar.gz: 83131487d0566bb957bab2388e888843ca9b108ec95644811fb702c57bfad628bb41a1ea2b9d1b728090230b48767175bc53af7f0601e8acf8c4d299b5a6de33
6
+ metadata.gz: 9ca5436d7e4309dd9659fdce7ee893b122e9da96e9f7b15bf00de5dea32c635e2828a99939f04ef5bf0d9494ab89957829a65002dc3e855fa8a66f54abbbd181
7
+ data.tar.gz: d62b2a137c19d3b24facb11eda5c1b81be5841120b505877b8617bee2b9f183dbe4b4d42a95af27447346a3d48476d7faec48b57cd89b88c0ddc9709f1b5d51b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.2.1 (2021-07-16)
2
+
3
+ - Added `create_dct` method
4
+ - Added `ComputeDeltas`, `Fade`, `MFCC`, and `Vol` transforms
5
+
1
6
  ## 0.2.0 (2021-05-23)
2
7
 
3
8
  - Updated to Rice 4
data/README.md CHANGED
@@ -51,10 +51,16 @@ TorchAudio::Transforms::Spectrogram.new.call(waveform)
51
51
 
52
52
  Supported transforms are:
53
53
 
54
+ - AmplitudeToDB
55
+ - ComputeDeltas
56
+ - Fade
57
+ - MelScale
54
58
  - MelSpectrogram
59
+ - MFCC
55
60
  - MuLawDecoding
56
61
  - MuLawEncoding
57
62
  - Spectrogram
63
+ - Vol
58
64
 
59
65
  ## Functional
60
66
 
@@ -64,7 +70,11 @@ TorchAudio::Functional.lowpass_biquad(waveform, sample_rate, cutoff_freq)
64
70
 
65
71
  Supported functions are:
66
72
 
73
+ - amplitude_to_DB
67
74
  - compute_deltas
75
+ - create_dct
76
+ - create_fb_matrix
77
+ - DB_to_amplitude
68
78
  - dither
69
79
  - gain
70
80
  - highpass_biquad
@@ -22,7 +22,6 @@ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
22
22
 
23
23
  # check omp first
24
24
  if have_library("omp") || have_library("gomp")
25
- $CXXFLAGS += " -DAT_PARALLEL_OPENMP=1"
26
25
  $CXXFLAGS += " -Xclang" if apple_clang
27
26
  $CXXFLAGS += " -fopenmp"
28
27
  end
data/lib/torchaudio.rb CHANGED
@@ -15,12 +15,16 @@ require "set"
15
15
  require "torchaudio/datasets/utils"
16
16
  require "torchaudio/datasets/yesno"
17
17
  require "torchaudio/functional"
18
+ require "torchaudio/transforms/compute_deltas"
19
+ require "torchaudio/transforms/fade"
18
20
  require "torchaudio/transforms/mel_scale"
19
21
  require "torchaudio/transforms/mel_spectrogram"
20
22
  require "torchaudio/transforms/mu_law_encoding"
21
23
  require "torchaudio/transforms/mu_law_decoding"
22
24
  require "torchaudio/transforms/spectrogram"
23
25
  require "torchaudio/transforms/amplitude_to_db"
26
+ require "torchaudio/transforms/mfcc"
27
+ require "torchaudio/transforms/vol"
24
28
  require "torchaudio/version"
25
29
 
26
30
  module TorchAudio
@@ -12,9 +12,18 @@ module TorchAudio
12
12
  waveform = waveform.reshape(-1, shape[-1])
13
13
 
14
14
  # default values are consistent with librosa.core.spectrum._spectrogram
15
- spec_f = Torch.stft(
16
- waveform, n_fft, hop_length: hop_length, win_length: win_length, window: window, center: true, pad_mode: "reflect", normalized: false, onesided: true
17
- )
15
+ spec_f =
16
+ Torch.stft(
17
+ waveform,
18
+ n_fft,
19
+ hop_length: hop_length,
20
+ win_length: win_length,
21
+ window: window,
22
+ center: true,
23
+ pad_mode: "reflect",
24
+ normalized: false,
25
+ onesided: true
26
+ )
18
27
 
19
28
  # unpack batch
20
29
  spec_f = spec_f.reshape(shape[0..-2] + spec_f.shape[-3..-1])
@@ -240,6 +249,23 @@ module TorchAudio
240
249
  Torch.pow(Torch.pow(10.0, db * 0.1), power) * ref
241
250
  end
242
251
 
252
+ def create_dct(n_mfcc, n_mels, norm: nil)
253
+ n = Torch.arange(n_mels.to_f)
254
+ k = Torch.arange(n_mfcc.to_f).unsqueeze!(1)
255
+ dct = Torch.cos((n + 0.5) * k * Math::PI / n_mels.to_f)
256
+
257
+ if norm.nil?
258
+ dct *= 2.0
259
+ else
260
+ raise ArgumentError, "Invalid DCT norm value" unless norm == :ortho
261
+
262
+ dct[0] *= 1.0 / Math.sqrt(2.0)
263
+ dct *= Math.sqrt(2.0 / n_mels)
264
+ end
265
+
266
+ dct.t
267
+ end
268
+
243
269
  private
244
270
 
245
271
  def _apply_probability_distribution(waveform, density_function: "TPDF")
@@ -0,0 +1,15 @@
1
+ module TorchAudio
2
+ module Transforms
3
+ class ComputeDeltas < Torch::NN::Module
4
+ def initialize(win_length: 5, mode: "replicate")
5
+ super()
6
+ @win_length = win_length
7
+ @mode = mode
8
+ end
9
+
10
+ def forward(specgram)
11
+ F.compute_deltas(specgram, win_length: @win_length, mode: @mode)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,74 @@
1
+ module TorchAudio
2
+ module Transforms
3
+ class Fade < Torch::NN::Module
4
+ def initialize(fade_in_len: 0, fade_out_len: 0, fade_shape: "linear")
5
+ super()
6
+ @fade_in_len = fade_in_len
7
+ @fade_out_len = fade_out_len
8
+ @fade_shape = fade_shape
9
+ end
10
+
11
+ def forward(waveform)
12
+ waveform_length = waveform.size[-1]
13
+ device = waveform.device
14
+ fade_in(waveform_length).to(device) * fade_out(waveform_length).to(device) * waveform
15
+ end
16
+
17
+ private
18
+
19
+ def fade_in(waveform_length)
20
+ fade = Torch.linspace(0, 1, @fade_in_len)
21
+ ones = Torch.ones(waveform_length - @fade_in_len)
22
+
23
+ if @fade_shape == "linear"
24
+ fade = fade
25
+ end
26
+
27
+ if @fade_shape == "exponential"
28
+ fade = Torch.pow(2, (fade - 1)) * fade
29
+ end
30
+
31
+ if @fade_shape == "logarithmic"
32
+ fade = Torch.log10(0.1 + fade) + 1
33
+ end
34
+
35
+ if @fade_shape == "quarter_sine"
36
+ fade = Torch.sin(fade * Math::PI / 2)
37
+ end
38
+
39
+ if @fade_shape == "half_sine"
40
+ fade = Torch.sin(fade * Math::PI - Math::PI / 2) / 2 + 0.5
41
+ end
42
+
43
+ Torch.cat([fade, ones]).clamp!(0, 1)
44
+ end
45
+
46
+ def fade_out(waveform_length)
47
+ fade = Torch.linspace(0, 1, @fade_out_len)
48
+ ones = Torch.ones(waveform_length - @fade_out_len)
49
+
50
+ if @fade_shape == "linear"
51
+ fade = - fade + 1
52
+ end
53
+
54
+ if @fade_shape == "exponential"
55
+ fade = Torch.pow(2, - fade) * (1 - fade)
56
+ end
57
+
58
+ if @fade_shape == "logarithmic"
59
+ fade = Torch.log10(1.1 - fade) + 1
60
+ end
61
+
62
+ if @fade_shape == "quarter_sine"
63
+ fade = Torch.sin(fade * Math::PI / 2 + Math::PI / 2)
64
+ end
65
+
66
+ if @fade_shape == "half_sine"
67
+ fade = Torch.sin(fade * Math::PI + Math::PI / 2) / 2 + 0.5
68
+ end
69
+
70
+ Torch.cat([ones, fade]).clamp!(0, 1)
71
+ end
72
+ end
73
+ end
74
+ end
@@ -1,6 +1,8 @@
1
1
  module TorchAudio
2
2
  module Transforms
3
3
  class MelSpectrogram < Torch::NN::Module
4
+ attr_reader :n_mels
5
+
4
6
  def initialize(
5
7
  sample_rate: 16000, n_fft: 400, win_length: nil, hop_length: nil, f_min: 0.0,
6
8
  f_max: nil, pad: 0, n_mels: 128, window_fn: Torch.method(:hann_window),
@@ -0,0 +1,43 @@
1
+ module TorchAudio
2
+ module Transforms
3
+ class MFCC < Torch::NN::Module
4
+
5
+ SUPPORTED_DCT_TYPES = [2]
6
+
7
+ def initialize(sample_rate: 16000, n_mfcc: 40, dct_type: 2, norm: :ortho, log_mels: false, melkwargs: {})
8
+ super()
9
+
10
+ raise ArgumentError, "DCT type not supported: #{dct_type}" unless SUPPORTED_DCT_TYPES.include?(dct_type)
11
+
12
+ @sample_rate = sample_rate
13
+ @n_mfcc = n_mfcc
14
+ @dct_type = dct_type
15
+ @norm = norm
16
+ @top_db = 80.0
17
+ @amplitude_to_db = TorchAudio::Transforms::AmplitudeToDB.new(stype: :power, top_db: @top_db)
18
+
19
+ @melspectrogram = TorchAudio::Transforms::MelSpectrogram.new(sample_rate: @sample_rate, **melkwargs)
20
+
21
+ raise ArgumentError, "Cannot select more MFCC coefficients than # mel bins" if @n_mfcc > @melspectrogram.n_mels
22
+
23
+ dct_mat = F.create_dct(@n_mfcc, @melspectrogram.n_mels, norm: @norm)
24
+ register_buffer('dct_mat', dct_mat)
25
+
26
+ @log_mels = log_mels
27
+ end
28
+
29
+ def forward(waveform)
30
+ mel_specgram = @melspectrogram.(waveform)
31
+ if @log_mels
32
+ mel_specgram = Torch.log(mel_specgram + 1e-6)
33
+ else
34
+ mel_specgram = @amplitude_to_db.(mel_specgram)
35
+ end
36
+
37
+ Torch
38
+ .matmul(mel_specgram.transpose(-2, -1), @dct_mat)
39
+ .transpose(-2, -1)
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,31 @@
1
+ module TorchAudio
2
+ module Transforms
3
+ class Vol < Torch::NN::Module
4
+ def initialize(gain, gain_type: "amplitude")
5
+ super()
6
+ @gain = gain
7
+ @gain_type = gain_type
8
+
9
+ if ["amplitude", "power"].include?(gain_type) && gain < 0
10
+ raise ArgumentError, "If gain_type = amplitude or power, gain must be positive."
11
+ end
12
+ end
13
+
14
+ def forward(waveform)
15
+ if @gain_type == "amplitude"
16
+ waveform = waveform * @gain
17
+ end
18
+
19
+ if @gain_type == "db"
20
+ waveform = F.gain(waveform, @gain)
21
+ end
22
+
23
+ if @gain_type == "power"
24
+ waveform = F.gain(waveform, 10 * Math.log10(@gain))
25
+ end
26
+
27
+ Torch.clamp(waveform, -1, 1)
28
+ end
29
+ end
30
+ end
31
+ end
@@ -1,3 +1,3 @@
1
1
  module TorchAudio
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: torchaudio
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-05-23 00:00:00.000000000 Z
11
+ date: 2021-07-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: torch-rb
@@ -64,11 +64,15 @@ files:
64
64
  - lib/torchaudio/datasets/yesno.rb
65
65
  - lib/torchaudio/functional.rb
66
66
  - lib/torchaudio/transforms/amplitude_to_db.rb
67
+ - lib/torchaudio/transforms/compute_deltas.rb
68
+ - lib/torchaudio/transforms/fade.rb
67
69
  - lib/torchaudio/transforms/mel_scale.rb
68
70
  - lib/torchaudio/transforms/mel_spectrogram.rb
71
+ - lib/torchaudio/transforms/mfcc.rb
69
72
  - lib/torchaudio/transforms/mu_law_decoding.rb
70
73
  - lib/torchaudio/transforms/mu_law_encoding.rb
71
74
  - lib/torchaudio/transforms/spectrogram.rb
75
+ - lib/torchaudio/transforms/vol.rb
72
76
  - lib/torchaudio/version.rb
73
77
  homepage: https://github.com/ankane/torchaudio
74
78
  licenses:
@@ -89,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
89
93
  - !ruby/object:Gem::Version
90
94
  version: '0'
91
95
  requirements: []
92
- rubygems_version: 3.2.3
96
+ rubygems_version: 3.2.22
93
97
  signing_key:
94
98
  specification_version: 4
95
99
  summary: Data manipulation and transformation for audio signal processing