itak 1 → 3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/bin/itak +4 -1
- data/itak.gemspec +8 -4
- data/lib/itak/denoiser.rb +4 -7
- data/lib/itak/transcriber.rb +39 -0
- data/lib/itak/vad.rb +15 -20
- data/lib/itak.rb +26 -6
- data/test/test_itak.rb +1 -0
- metadata +61 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a7ab66466c640ba4b91cfdf41de938434bdf3decd225c606213ff0aa91a9cff2
|
|
4
|
+
data.tar.gz: 8eabbd49439e13cd6d1831a7fc5fd83a391538f89a06e9c69640ba3ec5006ace
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0f7278553178a3d2fa2de089f14a71f5079814528772adccd5fd1897056b31fb5a624efe5b73c2a8779f6d3bb7e8ee213cea8aea8fc26a1f9984967b9b6a75c7
|
|
7
|
+
data.tar.gz: ed4f9e3765197050aa8061e52cad8cfcbed747f00c2433e59f80fa7ce1a715255be4ec0e36ecc7d78cae85dc7fc8463d4aa2fc7154eee3e12eda0b882b7d069c
|
data/README.md
CHANGED
data/bin/itak
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require "optparse"
|
|
2
|
+
require "optparse/pathname"
|
|
2
3
|
require "itak"
|
|
3
4
|
|
|
4
5
|
def main(argv)
|
|
@@ -8,6 +9,8 @@ def main(argv)
|
|
|
8
9
|
output = Itak.new.run(src, options[:output])
|
|
9
10
|
$stderr.puts "Output written to"
|
|
10
11
|
$stderr.puts output
|
|
12
|
+
rescue => err
|
|
13
|
+
abort err.message
|
|
11
14
|
end
|
|
12
15
|
|
|
13
16
|
def parse_options(argv)
|
|
@@ -18,7 +21,7 @@ def parse_options(argv)
|
|
|
18
21
|
|
|
19
22
|
Usage: itak [options] INPUT
|
|
20
23
|
EOB
|
|
21
|
-
opt.on "-o", "--output=PATH", "Specify output file or directory" do |path|
|
|
24
|
+
opt.on "-o", "--output=PATH", "Specify output file or directory", Pathname do |path|
|
|
22
25
|
options[:output] = path
|
|
23
26
|
end
|
|
24
27
|
}.parse!(argv)
|
data/itak.gemspec
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Gem::Specification.new do |s|
|
|
2
2
|
s.name = "itak"
|
|
3
|
-
s.version = "1"
|
|
3
|
+
s.version = "3"
|
|
4
4
|
s.authors = ["Kitaiti Makoto"]
|
|
5
5
|
s.email = ["KitaitiMaokto@gmail.com"]
|
|
6
6
|
s.summary = "Audio editing tool for podcasters"
|
|
@@ -11,9 +11,13 @@ Gem::Specification.new do |s|
|
|
|
11
11
|
s.files = `git ls-files -z`.split("\x0")
|
|
12
12
|
s.executables = s.files.filter_map {|f| File.basename(f) if f.start_with?("bin/")}
|
|
13
13
|
|
|
14
|
-
s.add_runtime_dependency "gtcrn"
|
|
15
|
-
s.add_runtime_dependency "whispercpp"
|
|
16
|
-
s.add_runtime_dependency "torchaudio"
|
|
14
|
+
s.add_runtime_dependency "gtcrn", ">= 0.0.2"
|
|
15
|
+
s.add_runtime_dependency "whispercpp", ">= 1.3.6"
|
|
16
|
+
s.add_runtime_dependency "torchaudio", ">= 0.5.0"
|
|
17
|
+
s.add_runtime_dependency "torchcodec"
|
|
18
|
+
s.add_runtime_dependency "torch-rb"
|
|
19
|
+
s.add_runtime_dependency "optparse-pathname"
|
|
20
|
+
s.add_runtime_dependency "ndav-torch-tensor"
|
|
17
21
|
|
|
18
22
|
s.add_development_dependency "rake"
|
|
19
23
|
s.add_development_dependency "rubygems-tasks"
|
data/lib/itak/denoiser.rb
CHANGED
|
@@ -1,16 +1,13 @@
|
|
|
1
|
-
require "fileutils"
|
|
2
1
|
require "gtcrn"
|
|
3
2
|
|
|
4
3
|
class Itak
|
|
5
4
|
class Denoiser
|
|
6
|
-
def initialize
|
|
7
|
-
@
|
|
5
|
+
def initialize
|
|
6
|
+
@gtcrn = GTCRN.new
|
|
8
7
|
end
|
|
9
8
|
|
|
10
|
-
def denoise(src
|
|
11
|
-
|
|
12
|
-
dest ||= dir/src.sub_ext(".denoised" + src.extname).basename
|
|
13
|
-
GTCRN.new.enhance_speech(src, dest)
|
|
9
|
+
def denoise(src)
|
|
10
|
+
@gtcrn.enhance_speech_waveform(src)
|
|
14
11
|
end
|
|
15
12
|
end
|
|
16
13
|
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
require "whisper"
|
|
2
|
+
|
|
3
|
+
class Itak
|
|
4
|
+
class Transcriber
|
|
5
|
+
PARAMS = {
|
|
6
|
+
language: "ja",
|
|
7
|
+
temperature: 1.0
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
def initialize(model: "large-v3-turbo-q8_0")
|
|
11
|
+
Whisper.log_set proc {}, nil
|
|
12
|
+
@whisper = Whisper::Context.new(model)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def run(src, params: {})
|
|
16
|
+
params = Whisper::Params.new(**PARAMS.merge(params))
|
|
17
|
+
src = src[0] if src.ndim == 2 && src.shape[0] == 1
|
|
18
|
+
@whisper
|
|
19
|
+
.full(params, src.to_ndav)
|
|
20
|
+
.each_segment
|
|
21
|
+
.collect {|segment|
|
|
22
|
+
"[%<start_time>s --> %<end_time>s]%<text>s" % {
|
|
23
|
+
start_time: format_time(segment.start_time),
|
|
24
|
+
end_time: format_time(segment.end_time),
|
|
25
|
+
text: segment.text
|
|
26
|
+
}
|
|
27
|
+
}.join("\n\n")
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def format_time(time_ms)
|
|
33
|
+
sec, decimal_part = time_ms.divmod(1000)
|
|
34
|
+
min, sec = sec.divmod(60)
|
|
35
|
+
hour, min = min.divmod(60)
|
|
36
|
+
"%02d:%02d:%02d" % [hour, min, sec]
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
data/lib/itak/vad.rb
CHANGED
|
@@ -1,33 +1,28 @@
|
|
|
1
|
-
require "pathname"
|
|
2
|
-
require "fileutils"
|
|
3
1
|
require "whisper"
|
|
4
|
-
require "
|
|
2
|
+
require "ndav/torch/tensor"
|
|
5
3
|
|
|
6
4
|
class Itak
|
|
7
5
|
class VAD
|
|
6
|
+
PARAMS = Whisper::VAD::Params.new(
|
|
7
|
+
threshold: 0.7,
|
|
8
|
+
min_silence_duration_ms: 1500
|
|
9
|
+
)
|
|
10
|
+
|
|
8
11
|
def initialize(model: "silero-v6.2.0")
|
|
9
|
-
|
|
12
|
+
Whisper.log_set proc {}, nil
|
|
13
|
+
@vad = Whisper::VAD::Context.new(model)
|
|
10
14
|
end
|
|
11
15
|
|
|
12
|
-
def run(src,
|
|
13
|
-
src =
|
|
14
|
-
dest = Pathname(dest)
|
|
15
|
-
context = Whisper::VAD::Context.new(@model)
|
|
16
|
-
params ||= Whisper::VAD::Params.new(
|
|
17
|
-
threshold: 0.7,
|
|
18
|
-
min_silence_duration_ms: 1000
|
|
19
|
-
)
|
|
20
|
-
waveform, sample_rate = TorchAudio.load(src)
|
|
16
|
+
def run(src, params: PARAMS)
|
|
17
|
+
src = src[0] if src.ndim == 2 && src.shape[0] == 1
|
|
21
18
|
chunks = []
|
|
22
|
-
|
|
19
|
+
@vad.segments_from_samples(params, src.to_ndav).each do |segment|
|
|
23
20
|
segment => {start_time:, end_time:}
|
|
24
|
-
st = start_time *
|
|
25
|
-
en = end_time *
|
|
26
|
-
chunks <<
|
|
21
|
+
st = start_time * SAMPLE_RATE / 1000
|
|
22
|
+
en = end_time * SAMPLE_RATE / 1000
|
|
23
|
+
chunks << src[st..en]
|
|
27
24
|
end
|
|
28
|
-
|
|
29
|
-
TorchAudio.save(dest.to_path, output.unsqueeze(0), sample_rate)
|
|
30
|
-
dest
|
|
25
|
+
Torch.cat(chunks)
|
|
31
26
|
end
|
|
32
27
|
end
|
|
33
28
|
end
|
data/lib/itak.rb
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
require "pathname"
|
|
2
|
-
require "
|
|
2
|
+
require "torchaudio"
|
|
3
3
|
|
|
4
4
|
require "itak/denoiser"
|
|
5
5
|
require "itak/vad"
|
|
6
|
+
require "itak/transcriber"
|
|
6
7
|
|
|
7
8
|
class Itak
|
|
9
|
+
SAMPLE_RATE = 16_000
|
|
10
|
+
|
|
8
11
|
def run(src, dest=nil)
|
|
9
12
|
src = Pathname(src)
|
|
10
13
|
if dest
|
|
@@ -15,13 +18,30 @@ class Itak
|
|
|
15
18
|
else
|
|
16
19
|
dest = src.sub_ext(".denoised-vad" + src.extname)
|
|
17
20
|
end
|
|
21
|
+
dest.dirname.mkpath
|
|
18
22
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
waveform, sample_rate = TorchAudio.load(src.to_path)
|
|
24
|
+
raise "Currently only #{SAMPLE_RATE}Hz audio is supported" unless sample_rate == SAMPLE_RATE
|
|
25
|
+
|
|
26
|
+
$stderr.puts "Denoising..."
|
|
27
|
+
denoised = Denoiser.new.denoise(waveform)
|
|
28
|
+
|
|
29
|
+
$stderr.puts "VAD..."
|
|
30
|
+
vadded = VAD.new.run(denoised)
|
|
24
31
|
|
|
32
|
+
saving = Thread.new {
|
|
33
|
+
TorchAudio.save(dest.to_path, vadded.unsqueeze(0), SAMPLE_RATE)
|
|
34
|
+
$stderr.puts "Saved to #{dest}"
|
|
35
|
+
}
|
|
36
|
+
transcribing = Thread.new {
|
|
37
|
+
$stderr.puts "Transcribing..."
|
|
38
|
+
transcription = Transcriber.new.run(vadded)
|
|
39
|
+
transcription_path = dest.sub_ext(".txt")
|
|
40
|
+
transcription_path.write transcription
|
|
41
|
+
$stderr.puts "Transcription saved to #{transcription_path}"
|
|
42
|
+
}
|
|
43
|
+
saving.join
|
|
44
|
+
transcribing.join
|
|
25
45
|
dest
|
|
26
46
|
end
|
|
27
47
|
end
|
data/test/test_itak.rb
CHANGED
|
@@ -9,6 +9,7 @@ class TestItak < TestBase
|
|
|
9
9
|
output = Itak.new.run(@src)
|
|
10
10
|
assert_equal Pathname("test/fixtures/mix.denoised-vad.wav").expand_path, output
|
|
11
11
|
assert_path_exist output
|
|
12
|
+
assert_path_exist output.sub_ext(".txt")
|
|
12
13
|
|
|
13
14
|
src_reader = WaveFile::Reader.new(@src)
|
|
14
15
|
src_reader.close
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: itak
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: '1'
|
|
4
|
+
version: '3'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kitaiti Makoto
|
|
@@ -11,6 +11,48 @@ date: 1980-01-02 00:00:00.000000000 Z
|
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: gtcrn
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: 0.0.2
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: 0.0.2
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: whispercpp
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 1.3.6
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 1.3.6
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: torchaudio
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: 0.5.0
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: 0.5.0
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: torchcodec
|
|
14
56
|
requirement: !ruby/object:Gem::Requirement
|
|
15
57
|
requirements:
|
|
16
58
|
- - ">="
|
|
@@ -24,7 +66,7 @@ dependencies:
|
|
|
24
66
|
- !ruby/object:Gem::Version
|
|
25
67
|
version: '0'
|
|
26
68
|
- !ruby/object:Gem::Dependency
|
|
27
|
-
name:
|
|
69
|
+
name: torch-rb
|
|
28
70
|
requirement: !ruby/object:Gem::Requirement
|
|
29
71
|
requirements:
|
|
30
72
|
- - ">="
|
|
@@ -38,7 +80,21 @@ dependencies:
|
|
|
38
80
|
- !ruby/object:Gem::Version
|
|
39
81
|
version: '0'
|
|
40
82
|
- !ruby/object:Gem::Dependency
|
|
41
|
-
name:
|
|
83
|
+
name: optparse-pathname
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - ">="
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '0'
|
|
89
|
+
type: :runtime
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '0'
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: ndav-torch-tensor
|
|
42
98
|
requirement: !ruby/object:Gem::Requirement
|
|
43
99
|
requirements:
|
|
44
100
|
- - ">="
|
|
@@ -167,6 +223,7 @@ files:
|
|
|
167
223
|
- itak.gemspec
|
|
168
224
|
- lib/itak.rb
|
|
169
225
|
- lib/itak/denoiser.rb
|
|
226
|
+
- lib/itak/transcriber.rb
|
|
170
227
|
- lib/itak/vad.rb
|
|
171
228
|
- test/fixtures/.gitkeep
|
|
172
229
|
- test/helper.rb
|
|
@@ -189,7 +246,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
189
246
|
- !ruby/object:Gem::Version
|
|
190
247
|
version: '0'
|
|
191
248
|
requirements: []
|
|
192
|
-
rubygems_version: 4.0.
|
|
249
|
+
rubygems_version: 4.0.6
|
|
193
250
|
specification_version: 4
|
|
194
251
|
summary: Audio editing tool for podcasters
|
|
195
252
|
test_files: []
|