itak 1 → 3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1571ecc53ee18992338a73e4277e067576e8297ef871e9a748953ef34c8843c8
4
- data.tar.gz: 54a21a999995f4bf6db51023df49b2590df4a8296f8ceda7b0b012303cbd950a
3
+ metadata.gz: a7ab66466c640ba4b91cfdf41de938434bdf3decd225c606213ff0aa91a9cff2
4
+ data.tar.gz: 8eabbd49439e13cd6d1831a7fc5fd83a391538f89a06e9c69640ba3ec5006ace
5
5
  SHA512:
6
- metadata.gz: 5ce6ae6455378ce71b46ecc4e57e32fb2c70b018183cb615333cd303bf998dba6b5711a723a7b0381cb974ac2e297042036c653756fac7ef7b2cc76f3471d3a0
7
- data.tar.gz: 6cd9714589ff2ad220b718020237b107c618e0e4d78f34e0f5e42f0ef8ff1970d17250d0cef7505be68416c5cf49fddb1ddc08e14dd76f45c8e1dff7cdfa5b4b
6
+ metadata.gz: 0f7278553178a3d2fa2de089f14a71f5079814528772adccd5fd1897056b31fb5a624efe5b73c2a8779f6d3bb7e8ee213cea8aea8fc26a1f9984967b9b6a75c7
7
+ data.tar.gz: ed4f9e3765197050aa8061e52cad8cfcbed747f00c2433e59f80fa7ce1a715255be4ec0e36ecc7d78cae85dc7fc8463d4aa2fc7154eee3e12eda0b882b7d069c
data/README.md CHANGED
@@ -3,7 +3,12 @@ Itak
3
3
 
4
4
  Audio editing tool for podcasters.
5
5
 
6
- It denoises and removes non-active period from the audio file.
6
+ It does:
7
+
8
+ * remove noise, and
9
+ * remove non-active period
10
+
11
+ from the audio file.
7
12
 
8
13
  SYNOPSIS
9
14
  --------
data/bin/itak CHANGED
@@ -1,4 +1,5 @@
1
1
  require "optparse"
2
+ require "optparse/pathname"
2
3
  require "itak"
3
4
 
4
5
  def main(argv)
@@ -8,6 +9,8 @@ def main(argv)
8
9
  output = Itak.new.run(src, options[:output])
9
10
  $stderr.puts "Output written to"
10
11
  $stderr.puts output
12
+ rescue => err
13
+ abort err.message
11
14
  end
12
15
 
13
16
  def parse_options(argv)
@@ -18,7 +21,7 @@ def parse_options(argv)
18
21
 
19
22
  Usage: itak [options] INPUT
20
23
  EOB
21
- opt.on "-o", "--output=PATH", "Specify output file or directory" do |path|
24
+ opt.on "-o", "--output=PATH", "Specify output file or directory", Pathname do |path|
22
25
  options[:output] = path
23
26
  end
24
27
  }.parse!(argv)
data/itak.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "itak"
3
- s.version = "1"
3
+ s.version = "3"
4
4
  s.authors = ["Kitaiti Makoto"]
5
5
  s.email = ["KitaitiMaokto@gmail.com"]
6
6
  s.summary = "Audio editing tool for podcasters"
@@ -11,9 +11,13 @@ Gem::Specification.new do |s|
11
11
  s.files = `git ls-files -z`.split("\x0")
12
12
  s.executables = s.files.filter_map {|f| File.basename(f) if f.start_with?("bin/")}
13
13
 
14
- s.add_runtime_dependency "gtcrn"
15
- s.add_runtime_dependency "whispercpp"
16
- s.add_runtime_dependency "torchaudio"
14
+ s.add_runtime_dependency "gtcrn", ">= 0.0.2"
15
+ s.add_runtime_dependency "whispercpp", ">= 1.3.6"
16
+ s.add_runtime_dependency "torchaudio", ">= 0.5.0"
17
+ s.add_runtime_dependency "torchcodec"
18
+ s.add_runtime_dependency "torch-rb"
19
+ s.add_runtime_dependency "optparse-pathname"
20
+ s.add_runtime_dependency "ndav-torch-tensor"
17
21
 
18
22
  s.add_development_dependency "rake"
19
23
  s.add_development_dependency "rubygems-tasks"
data/lib/itak/denoiser.rb CHANGED
@@ -1,16 +1,13 @@
1
- require "fileutils"
2
1
  require "gtcrn"
3
2
 
4
3
  class Itak
5
4
  class Denoiser
6
- def initialize(work_dir: nil)
7
- @work_dir = work_dir ? Pathname(work_dir) : nil
5
+ def initialize
6
+ @gtcrn = GTCRN.new
8
7
  end
9
8
 
10
- def denoise(src, dest: nil)
11
- dir = @work_dir || Pathname.mktmpdir
12
- dest ||= dir/src.sub_ext(".denoised" + src.extname).basename
13
- GTCRN.new.enhance_speech(src, dest)
9
+ def denoise(src)
10
+ @gtcrn.enhance_speech_waveform(src)
14
11
  end
15
12
  end
16
13
  end
@@ -0,0 +1,39 @@
1
+ require "whisper"
2
+
3
+ class Itak
4
+ class Transcriber
5
+ PARAMS = {
6
+ language: "ja",
7
+ temperature: 1.0
8
+ }
9
+
10
+ def initialize(model: "large-v3-turbo-q8_0")
11
+ Whisper.log_set proc {}, nil
12
+ @whisper = Whisper::Context.new(model)
13
+ end
14
+
15
+ def run(src, params: {})
16
+ params = Whisper::Params.new(**PARAMS.merge(params))
17
+ src = src[0] if src.ndim == 2 && src.shape[0] == 1
18
+ @whisper
19
+ .full(params, src.to_ndav)
20
+ .each_segment
21
+ .collect {|segment|
22
+ "[%<start_time>s --> %<end_time>s]%<text>s" % {
23
+ start_time: format_time(segment.start_time),
24
+ end_time: format_time(segment.end_time),
25
+ text: segment.text
26
+ }
27
+ }.join("\n\n")
28
+ end
29
+
30
+ private
31
+
32
+ def format_time(time_ms)
33
+ sec, decimal_part = time_ms.divmod(1000)
34
+ min, sec = sec.divmod(60)
35
+ hour, min = min.divmod(60)
36
+ "%02d:%02d:%02d" % [hour, min, sec]
37
+ end
38
+ end
39
+ end
data/lib/itak/vad.rb CHANGED
@@ -1,33 +1,28 @@
1
- require "pathname"
2
- require "fileutils"
3
1
  require "whisper"
4
- require "torchaudio"
2
+ require "ndav/torch/tensor"
5
3
 
6
4
  class Itak
7
5
  class VAD
6
+ PARAMS = Whisper::VAD::Params.new(
7
+ threshold: 0.7,
8
+ min_silence_duration_ms: 1500
9
+ )
10
+
8
11
  def initialize(model: "silero-v6.2.0")
9
- @model = model
12
+ Whisper.log_set proc {}, nil
13
+ @vad = Whisper::VAD::Context.new(model)
10
14
  end
11
15
 
12
- def run(src, dest, params: nil)
13
- src = Pathname(src)
14
- dest = Pathname(dest)
15
- context = Whisper::VAD::Context.new(@model)
16
- params ||= Whisper::VAD::Params.new(
17
- threshold: 0.7,
18
- min_silence_duration_ms: 1000
19
- )
20
- waveform, sample_rate = TorchAudio.load(src)
16
+ def run(src, params: PARAMS)
17
+ src = src[0] if src.ndim == 2 && src.shape[0] == 1
21
18
  chunks = []
22
- context.detect(src.to_path, params).each do |segment|
19
+ @vad.segments_from_samples(params, src.to_ndav).each do |segment|
23
20
  segment => {start_time:, end_time:}
24
- st = start_time * sample_rate / 1000
25
- en = end_time * sample_rate / 1000
26
- chunks << waveform[0][st..en]
21
+ st = start_time * SAMPLE_RATE / 1000
22
+ en = end_time * SAMPLE_RATE / 1000
23
+ chunks << src[st..en]
27
24
  end
28
- output = Torch.cat(chunks)
29
- TorchAudio.save(dest.to_path, output.unsqueeze(0), sample_rate)
30
- dest
25
+ Torch.cat(chunks)
31
26
  end
32
27
  end
33
28
  end
data/lib/itak.rb CHANGED
@@ -1,10 +1,13 @@
1
1
  require "pathname"
2
- require "tmpdir"
2
+ require "torchaudio"
3
3
 
4
4
  require "itak/denoiser"
5
5
  require "itak/vad"
6
+ require "itak/transcriber"
6
7
 
7
8
  class Itak
9
+ SAMPLE_RATE = 16_000
10
+
8
11
  def run(src, dest=nil)
9
12
  src = Pathname(src)
10
13
  if dest
@@ -15,13 +18,30 @@ class Itak
15
18
  else
16
19
  dest = src.sub_ext(".denoised-vad" + src.extname)
17
20
  end
21
+ dest.dirname.mkpath
18
22
 
19
- Pathname.mktmpdir do |work_dir|
20
- # TODO: Use memory instead of files to pipeline data
21
- denoised = Denoiser.new(work_dir:).denoise(src)
22
- VAD.new.run(denoised, dest)
23
- end
23
+ waveform, sample_rate = TorchAudio.load(src.to_path)
24
+ raise "Currently only #{SAMPLE_RATE}Hz audio is supported" unless sample_rate == SAMPLE_RATE
25
+
26
+ $stderr.puts "Denoising..."
27
+ denoised = Denoiser.new.denoise(waveform)
28
+
29
+ $stderr.puts "VAD..."
30
+ vadded = VAD.new.run(denoised)
24
31
 
32
+ saving = Thread.new {
33
+ TorchAudio.save(dest.to_path, vadded.unsqueeze(0), SAMPLE_RATE)
34
+ $stderr.puts "Saved to #{dest}"
35
+ }
36
+ transcribing = Thread.new {
37
+ $stderr.puts "Transcribing..."
38
+ transcription = Transcriber.new.run(vadded)
39
+ transcription_path = dest.sub_ext(".txt")
40
+ transcription_path.write transcription
41
+ $stderr.puts "Transcription saved to #{transcription_path}"
42
+ }
43
+ saving.join
44
+ transcribing.join
25
45
  dest
26
46
  end
27
47
  end
data/test/test_itak.rb CHANGED
@@ -9,6 +9,7 @@ class TestItak < TestBase
9
9
  output = Itak.new.run(@src)
10
10
  assert_equal Pathname("test/fixtures/mix.denoised-vad.wav").expand_path, output
11
11
  assert_path_exist output
12
+ assert_path_exist output.sub_ext(".txt")
12
13
 
13
14
  src_reader = WaveFile::Reader.new(@src)
14
15
  src_reader.close
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: itak
3
3
  version: !ruby/object:Gem::Version
4
- version: '1'
4
+ version: '3'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kitaiti Makoto
@@ -11,6 +11,48 @@ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: gtcrn
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.0.2
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.2
26
+ - !ruby/object:Gem::Dependency
27
+ name: whispercpp
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.3.6
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.3.6
40
+ - !ruby/object:Gem::Dependency
41
+ name: torchaudio
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.5.0
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.5.0
54
+ - !ruby/object:Gem::Dependency
55
+ name: torchcodec
14
56
  requirement: !ruby/object:Gem::Requirement
15
57
  requirements:
16
58
  - - ">="
@@ -24,7 +66,7 @@ dependencies:
24
66
  - !ruby/object:Gem::Version
25
67
  version: '0'
26
68
  - !ruby/object:Gem::Dependency
27
- name: whispercpp
69
+ name: torch-rb
28
70
  requirement: !ruby/object:Gem::Requirement
29
71
  requirements:
30
72
  - - ">="
@@ -38,7 +80,21 @@ dependencies:
38
80
  - !ruby/object:Gem::Version
39
81
  version: '0'
40
82
  - !ruby/object:Gem::Dependency
41
- name: torchaudio
83
+ name: optparse-pathname
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ - !ruby/object:Gem::Dependency
97
+ name: ndav-torch-tensor
42
98
  requirement: !ruby/object:Gem::Requirement
43
99
  requirements:
44
100
  - - ">="
@@ -167,6 +223,7 @@ files:
167
223
  - itak.gemspec
168
224
  - lib/itak.rb
169
225
  - lib/itak/denoiser.rb
226
+ - lib/itak/transcriber.rb
170
227
  - lib/itak/vad.rb
171
228
  - test/fixtures/.gitkeep
172
229
  - test/helper.rb
@@ -189,7 +246,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
189
246
  - !ruby/object:Gem::Version
190
247
  version: '0'
191
248
  requirements: []
192
- rubygems_version: 4.0.3
249
+ rubygems_version: 4.0.6
193
250
  specification_version: 4
194
251
  summary: Audio editing tool for podcasters
195
252
  test_files: []