gtcrn 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +28 -1
- data/gtcrn.gemspec +3 -2
- data/lib/gtcrn.rb +23 -10
- data/test/test_gtcrn.rb +27 -0
- metadata +16 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e838b8e452d988facb9cf6cfaa99c8a1b4e9c48073d7a035a07d3b3e42461ec8
|
|
4
|
+
data.tar.gz: 433c73cbe706d29786d62499ad17eb8b629ab5f5fc640aa3c8e999a56aa46397
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 21fd283b28ca5b35b7f9f02dab22cef8d4e342af47e7556594b97e70643351e18ff77a9da542e6b5d3963fbb812922740f00f6067b780e95409eb081111b8806
|
|
7
|
+
data.tar.gz: bc8d8aa599c3b53ef4801bebcb1ac5ef49e11349c71753292d4eeeecfc65bfb495d40a65df5cbe4ba8d14e6db17fe7e16ff25ff01417484a8db0f981ab8dac36
|
data/README.md
CHANGED
|
@@ -16,7 +16,32 @@ output = GTCRN.new.enhance_speech("path/to/audio.wav")
|
|
|
16
16
|
# => <Pathname:path/to/audio.enhanced.wav>
|
|
17
17
|
````
|
|
18
18
|
|
|
19
|
-
Audio file must be
|
|
19
|
+
Audio files must have a 16 kHz sampling rate and 16 bits per sample. Currently, any file format supported by [TorchAudio Ruby][] (via [TorchCodec Ruby][]) can be used.
|
|
20
|
+
|
|
21
|
+
INSTALLATION
|
|
22
|
+
------------
|
|
23
|
+
|
|
24
|
+
This gem depends on [Torch.rb][], [TorchAudio Ruby][] and [TorchCodec Ruby][], which require a precompiled libtorch and must be built against it.
|
|
25
|
+
|
|
26
|
+
% wget https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.10.0.zip # See https://pytorch.org/get-started/locally/ for download URI for your environment
|
|
27
|
+
% unzip -d path/to/libtorch libtorch-macos-arm64-2.10.0.zip
|
|
28
|
+
% gem install torch-rb -- --with-torch-dir=path/to/libtorch
|
|
29
|
+
% gem install torchaudio -- --with-torch-dir=path/to/libtorch
|
|
30
|
+
% gem install torchcodec -- --with-torch-dir=path/to/libtorch
|
|
31
|
+
% gem install gtcrn
|
|
32
|
+
|
|
33
|
+
Or,
|
|
34
|
+
|
|
35
|
+
% bundle config set --local build.torch-rb --with-torch-dir=path/to/libtorch
|
|
36
|
+
% bundle config set --local build.torchaudio --with-torch-dir=path/to/libtorch
|
|
37
|
+
% bundle config set --local build.torchcodec --with-torch-dir=path/to/libtorch
|
|
38
|
+
% bundle install
|
|
39
|
+
|
|
40
|
+
These instructions might be outdated. Refer to each library's own instructions if you have trouble.
|
|
41
|
+
|
|
42
|
+
[Torch.rb]: https://github.com/ankane/torch.rb
|
|
43
|
+
[TorchAudio Ruby]: https://github.com/ankane/torchaudio-ruby
|
|
44
|
+
[TorchCodec Ruby]: https://github.com/ankane/torchcodec-ruby
|
|
20
45
|
|
|
21
46
|
CLI
|
|
22
47
|
---
|
|
@@ -44,6 +69,8 @@ enhanced = GTCRN.new.enhance_speech_waveform(waveform)
|
|
|
44
69
|
TorchAudio.save("path/to/output.wav", enhanced.squeeze, sample_rate)
|
|
45
70
|
```
|
|
46
71
|
|
|
72
|
+
`GTCRN#enhance_speech_waveform` enhances each channel separately if you pass multi-channel audio.
|
|
73
|
+
|
|
47
74
|
LICENSE
|
|
48
75
|
-------
|
|
49
76
|
|
data/gtcrn.gemspec
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Gem::Specification.new do |s|
|
|
2
2
|
s.name = "gtcrn"
|
|
3
|
-
s.version = "0.0.2"
|
|
3
|
+
s.version = "0.0.3"
|
|
4
4
|
s.authors = ["Kitaiti Makoto"]
|
|
5
5
|
s.email = ["KitaitiMakoto@gmail.com"]
|
|
6
6
|
s.summary = "Denoises audio"
|
|
@@ -16,7 +16,8 @@ Gem::Specification.new do |s|
|
|
|
16
16
|
|
|
17
17
|
s.add_runtime_dependency "onnxruntime"
|
|
18
18
|
s.add_runtime_dependency "torch-rb"
|
|
19
|
-
s.add_runtime_dependency "torchaudio"
|
|
19
|
+
s.add_runtime_dependency "torchaudio", ">= 0.5.0"
|
|
20
|
+
s.add_runtime_dependency "torchcodec"
|
|
20
21
|
s.add_runtime_dependency "numo-narray-alt"
|
|
21
22
|
|
|
22
23
|
s.add_development_dependency "rake"
|
data/lib/gtcrn.rb
CHANGED
|
@@ -19,6 +19,7 @@ class GTCRN
|
|
|
19
19
|
|
|
20
20
|
def initialize
|
|
21
21
|
@session = OnnxRuntime::InferenceSession.new(MODEL_PATH)
|
|
22
|
+
@output_names = @session.outputs.collect {|output| output[:name]}
|
|
22
23
|
end
|
|
23
24
|
|
|
24
25
|
def enhance_speech(path, dest=nil)
|
|
@@ -41,31 +42,43 @@ class GTCRN
|
|
|
41
42
|
end
|
|
42
43
|
|
|
43
44
|
def enhance_speech_waveform(waveform)
|
|
44
|
-
|
|
45
|
+
ndim = waveform.ndim
|
|
46
|
+
unless ndim == 1 or ndim == 2
|
|
47
|
+
raise ArgumentError, "wrong dimension of argment (given #{ndim}, expected 1D or 2D"
|
|
48
|
+
end
|
|
49
|
+
waveform = [waveform] if ndim == 1
|
|
50
|
+
channels = waveform.collect {|channel| enhance_speech_waveform_channel(channel)}
|
|
51
|
+
ndim == 1 ? channels[0] : Torch.stack(channels)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def enhance_speech_waveform_channel(channel)
|
|
55
|
+
conv_cache, tra_cache, inter_cache = 1.upto(3).collect {|i|
|
|
56
|
+
OnnxRuntime::OrtValue.from_numo(
|
|
57
|
+
Numo::SFloat.zeros(*@session.inputs[i][:shape])
|
|
58
|
+
)
|
|
59
|
+
}
|
|
45
60
|
inputs = Torch.view_as_real(
|
|
46
|
-
Torch.stft(
|
|
61
|
+
Torch.stft(channel, **STFT_OPTS)[nil]
|
|
47
62
|
).numo
|
|
48
63
|
outputs = []
|
|
49
64
|
inputs.shape[-2].times do |i|
|
|
50
65
|
enh, conv_cache, tra_cache, inter_cache = @session.run(
|
|
51
|
-
@
|
|
66
|
+
@output_names,
|
|
52
67
|
{
|
|
53
68
|
mix: OnnxRuntime::OrtValue.from_numo(inputs[0.., 0.., i..i, 0..]),
|
|
54
|
-
conv_cache: OnnxRuntime::OrtValue.from_numo(conv_cache),
|
|
55
|
-
tra_cache: OnnxRuntime::OrtValue.from_numo(tra_cache),
|
|
56
|
-
inter_cache: OnnxRuntime::OrtValue.from_numo(inter_cache)
|
|
69
|
+
conv_cache:, tra_cache:, inter_cache:,
|
|
57
70
|
},
|
|
58
|
-
output_type: :numo
|
|
71
|
+
output_type: :ort_value
|
|
59
72
|
)
|
|
60
|
-
outputs << enh
|
|
73
|
+
outputs << enh.numo
|
|
61
74
|
end
|
|
62
75
|
concated = Numo::NArray.concatenate(outputs, axis: 2)
|
|
63
76
|
real = concated[0.., 0.., 0.., 0]
|
|
64
77
|
imag = concated[0.., 0.., 0.., 1]
|
|
65
78
|
enhanced = Torch.istft(
|
|
66
|
-
Torch.from_numo(real)
|
|
79
|
+
Torch.complex(Torch.from_numo(real), Torch.from_numo(imag)),
|
|
67
80
|
**ISTFT_OPTS
|
|
68
81
|
)
|
|
69
|
-
enhanced.squeeze
|
|
82
|
+
enhanced.squeeze(0)
|
|
70
83
|
end
|
|
71
84
|
end
|
data/test/test_gtcrn.rb
CHANGED
|
@@ -19,4 +19,31 @@ class TestGTCRN < Test::Unit::TestCase
|
|
|
19
19
|
assert source.total_sample_frames - enhanced.total_sample_frames < 512
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
|
+
|
|
23
|
+
def test_enhance_speech_waveform_one_dim
|
|
24
|
+
waveform = Torch.rand(16000)
|
|
25
|
+
enhanced = GTCRN.new.enhance_speech_waveform(waveform)
|
|
26
|
+
assert_equal waveform.ndim, enhanced.ndim
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def test_enhance_speech_waveform_two_dim
|
|
30
|
+
channels = 5
|
|
31
|
+
gtcrn = GTCRN.new
|
|
32
|
+
waveform = Torch.rand(channels, 16000)
|
|
33
|
+
enhanced = gtcrn.enhance_speech_waveform(waveform)
|
|
34
|
+
|
|
35
|
+
assert_equal waveform.shape[0..-2], enhanced.shape[0..-2]
|
|
36
|
+
|
|
37
|
+
0.upto(channels - 1) do |i|
|
|
38
|
+
enh = gtcrn.enhance_speech_waveform(waveform[i])
|
|
39
|
+
assert enh.equal(enhanced[i])
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def test_enhance_speech_waveform_channel
|
|
44
|
+
gtcrn = GTCRN.new
|
|
45
|
+
channel = Torch.rand(16000)
|
|
46
|
+
enhanced = gtcrn.enhance_speech_waveform_channel(channel)
|
|
47
|
+
assert_equal channel.ndim, enhanced.ndim
|
|
48
|
+
end
|
|
22
49
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gtcrn
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.2
|
|
4
|
+
version: 0.0.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kitaiti Makoto
|
|
@@ -39,6 +39,20 @@ dependencies:
|
|
|
39
39
|
version: '0'
|
|
40
40
|
- !ruby/object:Gem::Dependency
|
|
41
41
|
name: torchaudio
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: 0.5.0
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: 0.5.0
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: torchcodec
|
|
42
56
|
requirement: !ruby/object:Gem::Requirement
|
|
43
57
|
requirements:
|
|
44
58
|
- - ">="
|
|
@@ -188,7 +202,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
188
202
|
- !ruby/object:Gem::Version
|
|
189
203
|
version: '0'
|
|
190
204
|
requirements: []
|
|
191
|
-
rubygems_version: 4.0.
|
|
205
|
+
rubygems_version: 4.0.6
|
|
192
206
|
specification_version: 4
|
|
193
207
|
summary: Denoises audio
|
|
194
208
|
test_files: []
|