gtcrn 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +28 -1
  3. data/gtcrn.gemspec +3 -2
  4. data/lib/gtcrn.rb +23 -10
  5. data/test/test_gtcrn.rb +27 -0
  6. metadata +16 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e62dd7f2ba7da50ab841769e0ccc9d0aeef8f0a31499a2a59b80ffc749b1ca9b
4
- data.tar.gz: 555a5c91d412822e1e066bb1ea4bccdfd1cdf9e334adad025b886aea3a2b62f3
3
+ metadata.gz: e838b8e452d988facb9cf6cfaa99c8a1b4e9c48073d7a035a07d3b3e42461ec8
4
+ data.tar.gz: 433c73cbe706d29786d62499ad17eb8b629ab5f5fc640aa3c8e999a56aa46397
5
5
  SHA512:
6
- metadata.gz: c82eb21211da591054d65de5aa131219432f20ce161b53b750d961baed6bd5e3f840076ff17dcdf9568a059257544de648a33c1f7d51911e0513c2e855bb3e2e
7
- data.tar.gz: 50814826a17fb42e874f7ea7bd8fae2366f4b08d2961c2e354f4b8b9e4346199d37e61495c4a236cafc46f9886c2a1a9ba6f4b4e9b1b5f5e6f837a0a16265f03
6
+ metadata.gz: 21fd283b28ca5b35b7f9f02dab22cef8d4e342af47e7556594b97e70643351e18ff77a9da542e6b5d3963fbb812922740f00f6067b780e95409eb081111b8806
7
+ data.tar.gz: bc8d8aa599c3b53ef4801bebcb1ac5ef49e11349c71753292d4eeeecfc65bfb495d40a65df5cbe4ba8d14e6db17fe7e16ff25ff01417484a8db0f981ab8dac36
data/README.md CHANGED
@@ -16,7 +16,32 @@ output = GTCRN.new.enhance_speech("path/to/audio.wav")
16
16
  # => <Pathname:path/to/audio.enhanced.wav>
17
17
  ````
18
18
 
19
- Audio file must be mono WAV with 16kHz sampling rate and 16-bit per sample.
19
+ Audio file must have a 16kHz sampling rate and 16 bits per sample. Currently, any file format supported by [TorchAudio Ruby][] ([TorchCodec Ruby][]) can be used.
20
+
21
+ INSTALLATION
22
+ ------------
23
+
24
+ This gem depends on [Torch.rb][], [TorchAudio Ruby][] and [TorchCodec Ruby][], which require a precompiled libtorch and must be built against it.
25
+
26
+ % wget https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.10.0.zip # See https://pytorch.org/get-started/locally/ for download URI for your environment
27
+ % unzip -d path/to/libtorch libtorch-macos-arm64-2.10.0.zip
28
+ % gem install torch-rb -- --with-torch-dir=path/to/libtorch
29
+ % gem install torchaudio -- --with-torch-dir=path/to/libtorch
30
+ % gem install torchcodec -- --with-torch-dir=path/to/libtorch
31
+ % gem install gtcrn
32
+
33
+ Or,
34
+
35
+ % bundle config set --local build.torch-rb --with-torch-dir=path/to/libtorch
36
+ % bundle config set --local build.torchaudio --with-torch-dir=path/to/libtorch
37
+ % bundle config set --local build.torchcodec --with-torch-dir=path/to/libtorch
38
+ % bundle install
39
+
40
+ These instructions might be outdated. Refer to each library's own instructions if you run into trouble.
41
+
42
+ [Torch.rb]: https://github.com/ankane/torch.rb
43
+ [TorchAudio Ruby]: https://github.com/ankane/torchaudio-ruby
44
+ [TorchCodec Ruby]: https://github.com/ankane/torchcodec-ruby
20
45
 
21
46
  CLI
22
47
  ---
@@ -44,6 +69,8 @@ enhanced = GTCRN.new.enhance_speech_waveform(waveform)
44
69
  TorchAudio.save("path/to/output.wav", enhanced.squeeze, sample_rate)
45
70
  ```
46
71
 
72
+ `GTCRN#enhance_speech_waveform` enhances each channel separately if you pass multi-channel audio.
73
+
47
74
  LICENSE
48
75
  -------
49
76
 
data/gtcrn.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "gtcrn"
3
- s.version = "0.0.2"
3
+ s.version = "0.0.3"
4
4
  s.authors = ["Kitaiti Makoto"]
5
5
  s.email = ["KitaitiMakoto@gmail.com"]
6
6
  s.summary = "Denoises audio"
@@ -16,7 +16,8 @@ Gem::Specification.new do |s|
16
16
 
17
17
  s.add_runtime_dependency "onnxruntime"
18
18
  s.add_runtime_dependency "torch-rb"
19
- s.add_runtime_dependency "torchaudio"
19
+ s.add_runtime_dependency "torchaudio", ">= 0.5.0"
20
+ s.add_runtime_dependency "torchcodec"
20
21
  s.add_runtime_dependency "numo-narray-alt"
21
22
 
22
23
  s.add_development_dependency "rake"
data/lib/gtcrn.rb CHANGED
@@ -19,6 +19,7 @@ class GTCRN
19
19
 
20
20
  def initialize
21
21
  @session = OnnxRuntime::InferenceSession.new(MODEL_PATH)
22
+ @output_names = @session.outputs.collect {|output| output[:name]}
22
23
  end
23
24
 
24
25
  def enhance_speech(path, dest=nil)
@@ -41,31 +42,43 @@ class GTCRN
41
42
  end
42
43
 
43
44
  def enhance_speech_waveform(waveform)
44
- conv_cache, tra_cache, inter_cache = 1.upto(3).collect {|i| Numo::SFloat.zeros(*@session.inputs[i][:shape]) }
45
+ ndim = waveform.ndim
46
+ unless ndim == 1 or ndim == 2
47
+ raise ArgumentError, "wrong dimension of argment (given #{ndim}, expected 1D or 2D"
48
+ end
49
+ waveform = [waveform] if ndim == 1
50
+ channels = waveform.collect {|channel| enhance_speech_waveform_channel(channel)}
51
+ ndim == 1 ? channels[0] : Torch.stack(channels)
52
+ end
53
+
54
+ def enhance_speech_waveform_channel(channel)
55
+ conv_cache, tra_cache, inter_cache = 1.upto(3).collect {|i|
56
+ OnnxRuntime::OrtValue.from_numo(
57
+ Numo::SFloat.zeros(*@session.inputs[i][:shape])
58
+ )
59
+ }
45
60
  inputs = Torch.view_as_real(
46
- Torch.stft(waveform[0], **STFT_OPTS)[nil]
61
+ Torch.stft(channel, **STFT_OPTS)[nil]
47
62
  ).numo
48
63
  outputs = []
49
64
  inputs.shape[-2].times do |i|
50
65
  enh, conv_cache, tra_cache, inter_cache = @session.run(
51
- @session.outputs.collect {|output| output[:name]},
66
+ @output_names,
52
67
  {
53
68
  mix: OnnxRuntime::OrtValue.from_numo(inputs[0.., 0.., i..i, 0..]),
54
- conv_cache: OnnxRuntime::OrtValue.from_numo(conv_cache),
55
- tra_cache: OnnxRuntime::OrtValue.from_numo(tra_cache),
56
- inter_cache: OnnxRuntime::OrtValue.from_numo(inter_cache)
69
+ conv_cache:, tra_cache:, inter_cache:,
57
70
  },
58
- output_type: :numo
71
+ output_type: :ort_value
59
72
  )
60
- outputs << enh
73
+ outputs << enh.numo
61
74
  end
62
75
  concated = Numo::NArray.concatenate(outputs, axis: 2)
63
76
  real = concated[0.., 0.., 0.., 0]
64
77
  imag = concated[0.., 0.., 0.., 1]
65
78
  enhanced = Torch.istft(
66
- Torch.from_numo(real) + 1i * Torch.from_numo(imag),
79
+ Torch.complex(Torch.from_numo(real), Torch.from_numo(imag)),
67
80
  **ISTFT_OPTS
68
81
  )
69
- enhanced.squeeze
82
+ enhanced.squeeze(0)
70
83
  end
71
84
  end
data/test/test_gtcrn.rb CHANGED
@@ -19,4 +19,31 @@ class TestGTCRN < Test::Unit::TestCase
19
19
  assert source.total_sample_frames - enhanced.total_sample_frames < 512
20
20
  end
21
21
  end
22
+
23
+ def test_enhance_speech_waveform_one_dim
24
+ waveform = Torch.rand(16000)
25
+ enhanced = GTCRN.new.enhance_speech_waveform(waveform)
26
+ assert_equal waveform.ndim, enhanced.ndim
27
+ end
28
+
29
+ def test_enhance_speech_waveform_two_dim
30
+ channels = 5
31
+ gtcrn = GTCRN.new
32
+ waveform = Torch.rand(channels, 16000)
33
+ enhanced = gtcrn.enhance_speech_waveform(waveform)
34
+
35
+ assert_equal waveform.shape[0..-2], enhanced.shape[0..-2]
36
+
37
+ 0.upto(channels - 1) do |i|
38
+ enh = gtcrn.enhance_speech_waveform(waveform[i])
39
+ assert enh.equal(enhanced[i])
40
+ end
41
+ end
42
+
43
+ def test_enhance_speech_waveform_channel
44
+ gtcrn = GTCRN.new
45
+ channel = Torch.rand(16000)
46
+ enhanced = gtcrn.enhance_speech_waveform_channel(channel)
47
+ assert_equal channel.ndim, enhanced.ndim
48
+ end
22
49
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtcrn
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kitaiti Makoto
@@ -39,6 +39,20 @@ dependencies:
39
39
  version: '0'
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: torchaudio
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.5.0
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.5.0
54
+ - !ruby/object:Gem::Dependency
55
+ name: torchcodec
42
56
  requirement: !ruby/object:Gem::Requirement
43
57
  requirements:
44
58
  - - ">="
@@ -188,7 +202,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
188
202
  - !ruby/object:Gem::Version
189
203
  version: '0'
190
204
  requirements: []
191
- rubygems_version: 4.0.3
205
+ rubygems_version: 4.0.6
192
206
  specification_version: 4
193
207
  summary: Denoises audio
194
208
  test_files: []