itak 1 → 2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1571ecc53ee18992338a73e4277e067576e8297ef871e9a748953ef34c8843c8
4
- data.tar.gz: 54a21a999995f4bf6db51023df49b2590df4a8296f8ceda7b0b012303cbd950a
3
+ metadata.gz: 05d3ad0b0b7a48d1053aa421e23a4d720f8069e3b767d52ffe305eb52266ea4c
4
+ data.tar.gz: 1ca65afc391b6d3b5b80c79478e8317ec87198785a3376e34feafe9b20aa86b7
5
5
  SHA512:
6
- metadata.gz: 5ce6ae6455378ce71b46ecc4e57e32fb2c70b018183cb615333cd303bf998dba6b5711a723a7b0381cb974ac2e297042036c653756fac7ef7b2cc76f3471d3a0
7
- data.tar.gz: 6cd9714589ff2ad220b718020237b107c618e0e4d78f34e0f5e42f0ef8ff1970d17250d0cef7505be68416c5cf49fddb1ddc08e14dd76f45c8e1dff7cdfa5b4b
6
+ metadata.gz: 682271f0f3c32b88b489f3465f40df61bfc20588f611fedd41cdea75b2935ed14259166993f8039d7833ffc8abd7e22af3cf0c41d91bad33f15068e22c9143a3
7
+ data.tar.gz: 07e5d6f02e6127ebc74835561ee44c9c0aaa57094ef6fd110f980b08838b698f244cabe64d798b8acd9abf698db2296a6e6d54fded5a19eeb887759a30e10d6e
data/README.md CHANGED
@@ -3,7 +3,12 @@ Itak
3
3
 
4
4
  Audio editing tool for podcasters.
5
5
 
6
- It denoises and removes non-active period from the audio file.
6
+ It does:
7
+
8
+ * remove noise, and
9
+ * remove non-active period
10
+
11
+ from the audio file.
7
12
 
8
13
  SYNOPSIS
9
14
  --------
data/bin/itak CHANGED
@@ -1,4 +1,5 @@
1
1
  require "optparse"
2
+ require "optparse/pathname"
2
3
  require "itak"
3
4
 
4
5
  def main(argv)
@@ -8,6 +9,8 @@ def main(argv)
8
9
  output = Itak.new.run(src, options[:output])
9
10
  $stderr.puts "Output written to"
10
11
  $stderr.puts output
12
+ rescue => err
13
+ abort err.message
11
14
  end
12
15
 
13
16
  def parse_options(argv)
@@ -18,7 +21,7 @@ def parse_options(argv)
18
21
 
19
22
  Usage: itak [options] INPUT
20
23
  EOB
21
- opt.on "-o", "--output=PATH", "Specify output file or directory" do |path|
24
+ opt.on "-o", "--output=PATH", "Specify output file or directory", Pathname do |path|
22
25
  options[:output] = path
23
26
  end
24
27
  }.parse!(argv)
data/itak.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "itak"
3
- s.version = "1"
3
+ s.version = "2"
4
4
  s.authors = ["Kitaiti Makoto"]
5
5
  s.email = ["KitaitiMaokto@gmail.com"]
6
6
  s.summary = "Audio editing tool for podcasters"
@@ -11,9 +11,16 @@ Gem::Specification.new do |s|
11
11
  s.files = `git ls-files -z`.split("\x0")
12
12
  s.executables = s.files.filter_map {|f| File.basename(f) if f.start_with?("bin/")}
13
13
 
14
- s.add_runtime_dependency "gtcrn"
15
- s.add_runtime_dependency "whispercpp"
16
- s.add_runtime_dependency "torchaudio"
14
+ s.add_runtime_dependency "gtcrn", ">= 0.0.2"
15
+ s.add_runtime_dependency "whispercpp", ">= 1.3.6"
16
+ s.add_runtime_dependency "torchaudio", ">= 0.5.0"
17
+ s.add_runtime_dependency "torchcodec"
18
+ s.add_runtime_dependency "red-arrow"
19
+ s.add_runtime_dependency "numo-narray-alt"
20
+ s.add_runtime_dependency "red-arrow-numo-narray"
21
+ s.add_runtime_dependency "torch-rb"
22
+ s.add_runtime_dependency "optparse-pathname"
23
+ s.add_runtime_dependency "ndav-torch-tensor"
17
24
 
18
25
  s.add_development_dependency "rake"
19
26
  s.add_development_dependency "rubygems-tasks"
data/lib/itak/denoiser.rb CHANGED
@@ -1,16 +1,13 @@
1
- require "fileutils"
2
1
  require "gtcrn"
3
2
 
4
3
  class Itak
5
4
  class Denoiser
6
- def initialize(work_dir: nil)
7
- @work_dir = work_dir ? Pathname(work_dir) : nil
5
+ def initialize
6
+ @gtcrn = GTCRN.new
8
7
  end
9
8
 
10
- def denoise(src, dest: nil)
11
- dir = @work_dir || Pathname.mktmpdir
12
- dest ||= dir/src.sub_ext(".denoised" + src.extname).basename
13
- GTCRN.new.enhance_speech(src, dest)
9
+ def denoise(src)
10
+ @gtcrn.enhance_speech_waveform(src)
14
11
  end
15
12
  end
16
13
  end
data/lib/itak/vad.rb CHANGED
@@ -1,33 +1,27 @@
1
- require "pathname"
2
- require "fileutils"
3
1
  require "whisper"
4
- require "torchaudio"
2
+ require "ndav/torch/tensor"
5
3
 
6
4
  class Itak
7
5
  class VAD
6
+ PARAMS = Whisper::VAD::Params.new(
7
+ threshold: 0.7,
8
+ min_silence_duration_ms: 1500
9
+ )
10
+
8
11
  def initialize(model: "silero-v6.2.0")
9
- @model = model
12
+ @vad = Whisper::VAD::Context.new(model)
10
13
  end
11
14
 
12
- def run(src, dest, params: nil)
13
- src = Pathname(src)
14
- dest = Pathname(dest)
15
- context = Whisper::VAD::Context.new(@model)
16
- params ||= Whisper::VAD::Params.new(
17
- threshold: 0.7,
18
- min_silence_duration_ms: 1000
19
- )
20
- waveform, sample_rate = TorchAudio.load(src)
15
+ def run(src, params: PARAMS)
16
+ src = src[0] if src.ndim == 2 && src.shape[0] == 1
21
17
  chunks = []
22
- context.detect(src.to_path, params).each do |segment|
18
+ @vad.segments_from_samples(params, src.to_ndav).each do |segment|
23
19
  segment => {start_time:, end_time:}
24
- st = start_time * sample_rate / 1000
25
- en = end_time * sample_rate / 1000
26
- chunks << waveform[0][st..en]
20
+ st = start_time * SAMPLE_RATE / 1000
21
+ en = end_time * SAMPLE_RATE / 1000
22
+ chunks << src[st..en]
27
23
  end
28
- output = Torch.cat(chunks)
29
- TorchAudio.save(dest.to_path, output.unsqueeze(0), sample_rate)
30
- dest
24
+ Torch.cat(chunks)
31
25
  end
32
26
  end
33
27
  end
data/lib/itak.rb CHANGED
@@ -1,10 +1,12 @@
1
1
  require "pathname"
2
- require "tmpdir"
2
+ require "torchaudio"
3
3
 
4
4
  require "itak/denoiser"
5
5
  require "itak/vad"
6
6
 
7
7
  class Itak
8
+ SAMPLE_RATE = 16_000
9
+
8
10
  def run(src, dest=nil)
9
11
  src = Pathname(src)
10
12
  if dest
@@ -15,13 +17,19 @@ class Itak
15
17
  else
16
18
  dest = src.sub_ext(".denoised-vad" + src.extname)
17
19
  end
20
+ dest.dirname.mkpath
18
21
 
19
- Pathname.mktmpdir do |work_dir|
20
- # TODO: Use memory instead of files to pipeline data
21
- denoised = Denoiser.new(work_dir:).denoise(src)
22
- VAD.new.run(denoised, dest)
23
- end
22
+ waveform, sample_rate = TorchAudio.load(src.to_path)
23
+ raise "Currently only #{SAMPLE_RATE}Hz audio is supported" unless sample_rate == SAMPLE_RATE
24
+
25
+ $stderr.puts "Denoising..."
26
+ denoised = Denoiser.new.denoise(waveform)
27
+
28
+ $stderr.puts "VAD..."
29
+ vadded = VAD.new.run(denoised)
24
30
 
31
+ TorchAudio.save(dest.to_path, vadded.unsqueeze(0), SAMPLE_RATE)
32
+ $stderr.puts "Saved to #{dest}"
25
33
  dest
26
34
  end
27
35
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: itak
3
3
  version: !ruby/object:Gem::Version
4
- version: '1'
4
+ version: '2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kitaiti Makoto
@@ -11,6 +11,48 @@ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: gtcrn
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.0.2
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.2
26
+ - !ruby/object:Gem::Dependency
27
+ name: whispercpp
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.3.6
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.3.6
40
+ - !ruby/object:Gem::Dependency
41
+ name: torchaudio
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.5.0
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.5.0
54
+ - !ruby/object:Gem::Dependency
55
+ name: torchcodec
14
56
  requirement: !ruby/object:Gem::Requirement
15
57
  requirements:
16
58
  - - ">="
@@ -24,7 +66,7 @@ dependencies:
24
66
  - !ruby/object:Gem::Version
25
67
  version: '0'
26
68
  - !ruby/object:Gem::Dependency
27
- name: whispercpp
69
+ name: red-arrow
28
70
  requirement: !ruby/object:Gem::Requirement
29
71
  requirements:
30
72
  - - ">="
@@ -38,7 +80,63 @@ dependencies:
38
80
  - !ruby/object:Gem::Version
39
81
  version: '0'
40
82
  - !ruby/object:Gem::Dependency
41
- name: torchaudio
83
+ name: numo-narray-alt
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ - !ruby/object:Gem::Dependency
97
+ name: red-arrow-numo-narray
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: torch-rb
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ type: :runtime
118
+ prerelease: false
119
+ version_requirements: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ - !ruby/object:Gem::Dependency
125
+ name: optparse-pathname
126
+ requirement: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ type: :runtime
132
+ prerelease: false
133
+ version_requirements: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ - !ruby/object:Gem::Dependency
139
+ name: ndav-torch-tensor
42
140
  requirement: !ruby/object:Gem::Requirement
43
141
  requirements:
44
142
  - - ">="
@@ -189,7 +287,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
189
287
  - !ruby/object:Gem::Version
190
288
  version: '0'
191
289
  requirements: []
192
- rubygems_version: 4.0.3
290
+ rubygems_version: 4.0.6
193
291
  specification_version: 4
194
292
  summary: Audio editing tool for podcasters
195
293
  test_files: []