torchaudio 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9488128781f307b653965c253dcacbe246ebaf27bf4e6359b030d9a93bafc1b2
4
- data.tar.gz: 5dc18368bdef8945ecaeb1ad2d60771df98bfdd8c356ad2fcdd2b343b8c02b51
3
+ metadata.gz: 9ed4c14921f1eee18f5e08ddabfae51e09a9b5a7ef408f1dd67fdf7bfe9622fe
4
+ data.tar.gz: 1e37d5b9abed9cab7bf56a8c30a769bc8ff8f8a3e15e78bbb772847c444571b2
5
5
  SHA512:
6
- metadata.gz: ddabdfa32632e9d2af024a0f7b67f2add1b694a9a7ef9036f2a1c2f9338106069e6ec1afaea50a0c6f68a8aaa695e57c492d981fb7b3d704fd28ef420c3e2519
7
- data.tar.gz: 83131487d0566bb957bab2388e888843ca9b108ec95644811fb702c57bfad628bb41a1ea2b9d1b728090230b48767175bc53af7f0601e8acf8c4d299b5a6de33
6
+ metadata.gz: 9ca5436d7e4309dd9659fdce7ee893b122e9da96e9f7b15bf00de5dea32c635e2828a99939f04ef5bf0d9494ab89957829a65002dc3e855fa8a66f54abbbd181
7
+ data.tar.gz: d62b2a137c19d3b24facb11eda5c1b81be5841120b505877b8617bee2b9f183dbe4b4d42a95af27447346a3d48476d7faec48b57cd89b88c0ddc9709f1b5d51b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.2.1 (2021-07-16)
2
+
3
+ - Added `create_dct` method
4
+ - Added `ComputeDeltas`, `Fade`, `MFCC`, and `Vol` transforms
5
+
1
6
  ## 0.2.0 (2021-05-23)
2
7
 
3
8
  - Updated to Rice 4
data/README.md CHANGED
@@ -51,10 +51,16 @@ TorchAudio::Transforms::Spectrogram.new.call(waveform)
51
51
 
52
52
  Supported transforms are:
53
53
 
54
+ - AmplitudeToDB
55
+ - ComputeDeltas
56
+ - Fade
57
+ - MelScale
54
58
  - MelSpectrogram
59
+ - MFCC
55
60
  - MuLawDecoding
56
61
  - MuLawEncoding
57
62
  - Spectrogram
63
+ - Vol
58
64
 
59
65
  ## Functional
60
66
 
@@ -64,7 +70,11 @@ TorchAudio::Functional.lowpass_biquad(waveform, sample_rate, cutoff_freq)
64
70
 
65
71
  Supported functions are:
66
72
 
73
+ - amplitude_to_DB
67
74
  - compute_deltas
75
+ - create_dct
76
+ - create_fb_matrix
77
+ - DB_to_amplitude
68
78
  - dither
69
79
  - gain
70
80
  - highpass_biquad
@@ -22,7 +22,6 @@ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
22
22
 
23
23
  # check omp first
24
24
  if have_library("omp") || have_library("gomp")
25
- $CXXFLAGS += " -DAT_PARALLEL_OPENMP=1"
26
25
  $CXXFLAGS += " -Xclang" if apple_clang
27
26
  $CXXFLAGS += " -fopenmp"
28
27
  end
data/lib/torchaudio.rb CHANGED
@@ -15,12 +15,16 @@ require "set"
15
15
  require "torchaudio/datasets/utils"
16
16
  require "torchaudio/datasets/yesno"
17
17
  require "torchaudio/functional"
18
+ require "torchaudio/transforms/compute_deltas"
19
+ require "torchaudio/transforms/fade"
18
20
  require "torchaudio/transforms/mel_scale"
19
21
  require "torchaudio/transforms/mel_spectrogram"
20
22
  require "torchaudio/transforms/mu_law_encoding"
21
23
  require "torchaudio/transforms/mu_law_decoding"
22
24
  require "torchaudio/transforms/spectrogram"
23
25
  require "torchaudio/transforms/amplitude_to_db"
26
+ require "torchaudio/transforms/mfcc"
27
+ require "torchaudio/transforms/vol"
24
28
  require "torchaudio/version"
25
29
 
26
30
  module TorchAudio
@@ -12,9 +12,18 @@ module TorchAudio
12
12
  waveform = waveform.reshape(-1, shape[-1])
13
13
 
14
14
  # default values are consistent with librosa.core.spectrum._spectrogram
15
- spec_f = Torch.stft(
16
- waveform, n_fft, hop_length: hop_length, win_length: win_length, window: window, center: true, pad_mode: "reflect", normalized: false, onesided: true
17
- )
15
+ spec_f =
16
+ Torch.stft(
17
+ waveform,
18
+ n_fft,
19
+ hop_length: hop_length,
20
+ win_length: win_length,
21
+ window: window,
22
+ center: true,
23
+ pad_mode: "reflect",
24
+ normalized: false,
25
+ onesided: true
26
+ )
18
27
 
19
28
  # unpack batch
20
29
  spec_f = spec_f.reshape(shape[0..-2] + spec_f.shape[-3..-1])
@@ -240,6 +249,23 @@ module TorchAudio
240
249
  Torch.pow(Torch.pow(10.0, db * 0.1), power) * ref
241
250
  end
242
251
 
252
# Builds a DCT transformation matrix from cosine basis values.
#
# n_mfcc - number of coefficients to keep (one basis row per coefficient
#          before the final transpose).
# n_mels - number of mel bins (one basis column per bin before the
#          final transpose).
# norm   - nil to scale every entry by 2.0, or :ortho to scale the first
#          row by 1/sqrt(2) and the whole matrix by sqrt(2 / n_mels).
#
# Returns the transposed basis tensor.
# Raises ArgumentError when norm is neither nil nor :ortho.
def create_dct(n_mfcc, n_mels, norm: nil)
  mel_count = n_mels.to_f
  mel_index = Torch.arange(mel_count)
  # unsqueeze!(1) turns the coefficient index into a column so the product
  # below broadcasts against the mel index
  coeff_index = Torch.arange(n_mfcc.to_f).unsqueeze!(1)
  basis = Torch.cos((mel_index + 0.5) * coeff_index * Math::PI / mel_count)

  if norm.nil?
    basis *= 2.0
    return basis.t
  end

  raise ArgumentError, "Invalid DCT norm value" unless norm == :ortho

  # the first row gets the extra 1/sqrt(2) factor before the global scaling
  basis[0] *= 1.0 / Math.sqrt(2.0)
  basis *= Math.sqrt(2.0 / n_mels)
  basis.t
end
268
+
243
269
  private
244
270
 
245
271
  def _apply_probability_distribution(waveform, density_function: "TPDF")
@@ -0,0 +1,15 @@
1
module TorchAudio
  module Transforms
    # Module wrapper around F.compute_deltas that remembers the window
    # length and padding mode chosen at construction time.
    class ComputeDeltas < Torch::NN::Module
      # win_length - forwarded to F.compute_deltas as win_length: (default 5).
      # mode       - forwarded to F.compute_deltas as mode: (default "replicate").
      def initialize(win_length: 5, mode: "replicate")
        super()
        @win_length, @mode = win_length, mode
      end

      # Returns whatever F.compute_deltas produces for the given specgram
      # with the stored options.
      def forward(specgram)
        delta_options = {win_length: @win_length, mode: @mode}
        F.compute_deltas(specgram, **delta_options)
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
module TorchAudio
  module Transforms
    # Multiplies a waveform by a fade-in envelope and a fade-out envelope
    # along its last dimension.
    class Fade < Torch::NN::Module
      # fade_in_len  - number of samples in the fade-in ramp (default 0).
      # fade_out_len - number of samples in the fade-out ramp (default 0).
      # fade_shape   - "linear", "exponential", "logarithmic",
      #                "quarter_sine" or "half_sine"; any other value leaves
      #                the raw 0..1 ramp untouched in both envelopes.
      def initialize(fade_in_len: 0, fade_out_len: 0, fade_shape: "linear")
        super()
        @fade_in_len = fade_in_len
        @fade_out_len = fade_out_len
        @fade_shape = fade_shape
      end

      # Returns fade_in * fade_out * waveform, with both envelopes moved to
      # the waveform's device first.
      def forward(waveform)
        waveform_length = waveform.size[-1]
        device = waveform.device
        fade_in(waveform_length).to(device) * fade_out(waveform_length).to(device) * waveform
      end

      private

      # Envelope of length waveform_length: a shaped ramp over the first
      # @fade_in_len samples followed by ones, clamped to [0, 1].
      def fade_in(waveform_length)
        ramp = Torch.linspace(0, 1, @fade_in_len)
        ones = Torch.ones(waveform_length - @fade_in_len)

        shaped =
          case @fade_shape
          when "exponential"
            Torch.pow(2, (ramp - 1)) * ramp
          when "logarithmic"
            Torch.log10(0.1 + ramp) + 1
          when "quarter_sine"
            Torch.sin(ramp * Math::PI / 2)
          when "half_sine"
            Torch.sin(ramp * Math::PI - Math::PI / 2) / 2 + 0.5
          else
            # "linear" (and any unrecognised shape) uses the ramp as-is
            ramp
          end

        Torch.cat([shaped, ones]).clamp!(0, 1)
      end

      # Envelope of length waveform_length: ones followed by a shaped ramp
      # over the last @fade_out_len samples, clamped to [0, 1].
      def fade_out(waveform_length)
        ramp = Torch.linspace(0, 1, @fade_out_len)
        ones = Torch.ones(waveform_length - @fade_out_len)

        shaped =
          case @fade_shape
          when "linear"
            -ramp + 1
          when "exponential"
            Torch.pow(2, -ramp) * (1 - ramp)
          when "logarithmic"
            Torch.log10(1.1 - ramp) + 1
          when "quarter_sine"
            Torch.sin(ramp * Math::PI / 2 + Math::PI / 2)
          when "half_sine"
            Torch.sin(ramp * Math::PI + Math::PI / 2) / 2 + 0.5
          else
            # unrecognised shapes keep the rising ramp, as before
            ramp
          end

        Torch.cat([ones, shaped]).clamp!(0, 1)
      end
    end
  end
end
@@ -1,6 +1,8 @@
1
1
  module TorchAudio
2
2
  module Transforms
3
3
  class MelSpectrogram < Torch::NN::Module
4
+ attr_reader :n_mels
5
+
4
6
  def initialize(
5
7
  sample_rate: 16000, n_fft: 400, win_length: nil, hop_length: nil, f_min: 0.0,
6
8
  f_max: nil, pad: 0, n_mels: 128, window_fn: Torch.method(:hann_window),
@@ -0,0 +1,43 @@
1
module TorchAudio
  module Transforms
    # Computes cepstral coefficients by running a MelSpectrogram, applying
    # a log or dB scaling, and multiplying by a DCT matrix from F.create_dct.
    class MFCC < Torch::NN::Module
      # DCT variants accepted by the constructor; frozen so the constant
      # cannot be mutated at runtime.
      SUPPORTED_DCT_TYPES = [2].freeze

      # sample_rate - passed to MelSpectrogram (default 16000).
      # n_mfcc      - number of coefficients to keep (default 40).
      # dct_type    - must be listed in SUPPORTED_DCT_TYPES (default 2).
      # norm        - passed to F.create_dct as norm: (default :ortho).
      # log_mels    - when truthy, forward uses log(mel + 1e-6) instead of
      #               the AmplitudeToDB transform (default false).
      # melkwargs   - extra keyword arguments for MelSpectrogram.new.
      #
      # Raises ArgumentError for an unsupported dct_type, or when n_mfcc
      # exceeds the MelSpectrogram's n_mels.
      def initialize(sample_rate: 16000, n_mfcc: 40, dct_type: 2, norm: :ortho, log_mels: false, melkwargs: {})
        super()

        raise ArgumentError, "DCT type not supported: #{dct_type}" unless SUPPORTED_DCT_TYPES.include?(dct_type)

        @sample_rate = sample_rate
        @n_mfcc = n_mfcc
        @dct_type = dct_type
        @norm = norm
        @top_db = 80.0
        @amplitude_to_db = TorchAudio::Transforms::AmplitudeToDB.new(stype: :power, top_db: @top_db)

        @melspectrogram = TorchAudio::Transforms::MelSpectrogram.new(sample_rate: @sample_rate, **melkwargs)

        raise ArgumentError, "Cannot select more MFCC coefficients than # mel bins" if @n_mfcc > @melspectrogram.n_mels

        dct_mat = F.create_dct(@n_mfcc, @melspectrogram.n_mels, norm: @norm)
        # forward reads the buffer back through @dct_mat
        # NOTE(review): relies on register_buffer exposing the tensor as an
        # instance variable - confirm against torch-rb
        register_buffer('dct_mat', dct_mat)

        @log_mels = log_mels
      end

      # Returns the scaled mel spectrogram of waveform multiplied by the DCT
      # matrix; the last two dimensions are swapped for the matmul and then
      # swapped back.
      def forward(waveform)
        mel_specgram = @melspectrogram.call(waveform)
        scaled =
          if @log_mels
            # small offset before the log
            Torch.log(mel_specgram + 1e-6)
          else
            @amplitude_to_db.call(mel_specgram)
          end

        Torch
          .matmul(scaled.transpose(-2, -1), @dct_mat)
          .transpose(-2, -1)
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module TorchAudio
  module Transforms
    # Scales a waveform by a gain given as an amplitude ratio, a power
    # ratio or decibels, then clamps the result to [-1, 1].
    class Vol < Torch::NN::Module
      # gain      - the gain value, interpreted according to gain_type.
      # gain_type - "amplitude" (default), "power" or "db".
      #
      # Raises ArgumentError when gain is negative and gain_type is
      # "amplitude" or "power".
      def initialize(gain, gain_type: "amplitude")
        super()
        @gain = gain
        @gain_type = gain_type

        ratio_gain = %w[amplitude power].include?(gain_type)
        raise ArgumentError, "If gain_type = amplitude or power, gain must be positive." if ratio_gain && gain < 0
      end

      # Returns the gain-adjusted waveform clamped to [-1, 1]. A gain_type
      # other than the three known values applies no gain before clamping.
      def forward(waveform)
        adjusted =
          case @gain_type
          when "amplitude"
            waveform * @gain
          when "db"
            F.gain(waveform, @gain)
          when "power"
            # power ratio is converted to decibels before calling F.gain
            F.gain(waveform, 10 * Math.log10(@gain))
          else
            waveform
          end

        Torch.clamp(adjusted, -1, 1)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module TorchAudio
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: torchaudio
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-05-23 00:00:00.000000000 Z
11
+ date: 2021-07-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: torch-rb
@@ -64,11 +64,15 @@ files:
64
64
  - lib/torchaudio/datasets/yesno.rb
65
65
  - lib/torchaudio/functional.rb
66
66
  - lib/torchaudio/transforms/amplitude_to_db.rb
67
+ - lib/torchaudio/transforms/compute_deltas.rb
68
+ - lib/torchaudio/transforms/fade.rb
67
69
  - lib/torchaudio/transforms/mel_scale.rb
68
70
  - lib/torchaudio/transforms/mel_spectrogram.rb
71
+ - lib/torchaudio/transforms/mfcc.rb
69
72
  - lib/torchaudio/transforms/mu_law_decoding.rb
70
73
  - lib/torchaudio/transforms/mu_law_encoding.rb
71
74
  - lib/torchaudio/transforms/spectrogram.rb
75
+ - lib/torchaudio/transforms/vol.rb
72
76
  - lib/torchaudio/version.rb
73
77
  homepage: https://github.com/ankane/torchaudio
74
78
  licenses:
@@ -89,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
89
93
  - !ruby/object:Gem::Version
90
94
  version: '0'
91
95
  requirements: []
92
- rubygems_version: 3.2.3
96
+ rubygems_version: 3.2.22
93
97
  signing_key:
94
98
  specification_version: 4
95
99
  summary: Data manipulation and transformation for audio signal processing