speech2text 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/speech2text +5 -0
- data/lib/speech/audio_inspector.rb +1 -1
- data/lib/speech/audio_splitter.rb +20 -4
- data/lib/speech/audio_to_text.rb +13 -21
- data/lib/speech/version.rb +1 -1
- data/test/audio_to_text_test.rb +18 -0
- metadata +6 -6
data/bin/speech2text
CHANGED
data/lib/speech/audio_inspector.rb
CHANGED
@@ -14,7 +14,7 @@ module Speech
|
|
14
14
|
|
15
15
|
def to_s
|
16
16
|
s,f = seconds.split('.')
|
17
|
-
sprintf "%.2d:%.2d:%.2d:%.2d", self.hours.
|
17
|
+
sprintf "%.2d:%.2d:%.2d:%.2d", self.hours.to_i, self.minutes.to_i, s.to_i, (f||0).to_i
|
18
18
|
#"#{hours}:#{minutes}:#{seconds}:#{f}"
|
19
19
|
end
|
20
20
|
|
data/lib/speech/audio_splitter.rb
CHANGED
@@ -5,22 +5,33 @@ module Speech
|
|
5
5
|
attr_accessor :original_file, :size, :duration, :chunks
|
6
6
|
|
7
7
|
class AudioChunk
|
8
|
-
attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate
|
8
|
+
attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate, :copied
|
9
9
|
|
10
10
|
def initialize(splitter, offset, duration)
|
11
11
|
self.offset = offset
|
12
12
|
self.chunk = File.join(File.dirname(splitter.original_file), "chunk-" + File.basename(splitter.original_file).gsub(/\.(.*)$/, "-#{offset}" + '.\1'))
|
13
13
|
self.duration = duration
|
14
14
|
self.splitter = splitter
|
15
|
+
self.copied = false
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.copy(splitter)
|
19
|
+
chunk = AudioChunk.new(splitter, 0, splitter.duration.to_f)
|
20
|
+
chunk.copied = true
|
21
|
+
system("cp #{splitter.original_file} #{chunk.chunk}")
|
22
|
+
chunk
|
15
23
|
end
|
16
24
|
|
17
25
|
# given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
|
18
26
|
def build
|
27
|
+
return self if self.copied
|
19
28
|
# ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
|
20
29
|
offset_ts = AudioInspector::Duration.from_seconds(self.offset)
|
21
30
|
duration_ts = AudioInspector::Duration.from_seconds(self.duration)
|
31
|
+
# NOTE: kind of a hack, but if the original source is less than or equal to 1 second, we should skip ffmpeg
|
32
|
+
puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
|
22
33
|
#puts "offset: #{ offset_ts.to_s }, duration: #{duration_ts.to_s}"
|
23
|
-
cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk} >/dev/null 2>&1"
|
34
|
+
cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk}"# >/dev/null 2>&1"
|
24
35
|
if system(cmd)
|
25
36
|
self
|
26
37
|
else
|
@@ -30,7 +41,9 @@ module Speech
|
|
30
41
|
|
31
42
|
# convert the audio file to flac format
|
32
43
|
def to_flac
|
33
|
-
|
44
|
+
puts "convert: #{chunk} to flac"
|
45
|
+
if system("flac #{chunk}")
|
46
|
+
puts "success?"
|
34
47
|
self.flac_chunk = chunk.gsub(File.extname(chunk), ".flac")
|
35
48
|
# convert the audio file to 16K
|
36
49
|
self.flac_rate = `ffmpeg -i #{self.flac_chunk} 2>&1`.strip.scan(/Audio: flac, (.*) Hz/).first.first.strip
|
@@ -42,6 +55,8 @@ module Speech
|
|
42
55
|
raise "failed to convert to lower audio rate"
|
43
56
|
end
|
44
57
|
|
58
|
+
else
|
59
|
+
raise "failed to convert chunk: #{chunk} with flac #{chunk}"
|
45
60
|
end
|
46
61
|
end
|
47
62
|
|
@@ -75,10 +90,11 @@ module Speech
|
|
75
90
|
end
|
76
91
|
|
77
92
|
if chunks.empty?
|
78
|
-
chunks << AudioChunk.new(self, 0, self.duration.to_f)
|
93
|
+
chunks << AudioChunk.copy(self)#, 0, self.duration.to_f)
|
79
94
|
else
|
80
95
|
chunks << AudioChunk.new(self, chunks.last.offset.to_i + chunks.last.duration.to_i, self.size + last_chunk)
|
81
96
|
end
|
97
|
+
puts "Chunk count: #{chunks.size}"
|
82
98
|
|
83
99
|
chunks
|
84
100
|
end
|
data/lib/speech/audio_to_text.rb
CHANGED
@@ -2,17 +2,16 @@
|
|
2
2
|
module Speech
|
3
3
|
|
4
4
|
class AudioToText
|
5
|
-
attr_accessor :file, :rate, :captured_json, :confidence, :captured_file
|
5
|
+
attr_accessor :file, :rate, :captured_json, :captured_file
|
6
6
|
|
7
7
|
def initialize(file)
|
8
8
|
self.file = file
|
9
9
|
self.captured_file = self.file.gsub(/\.wav$/,'.json')
|
10
|
-
self.captured_json =
|
11
|
-
self.confidence = 0.0
|
10
|
+
self.captured_json = {}
|
12
11
|
end
|
13
12
|
|
14
13
|
def to_text
|
15
|
-
url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US&maxresults=
|
14
|
+
url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US&maxresults=10"
|
16
15
|
splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
|
17
16
|
easy = Curl::Easy.new(url)
|
18
17
|
splitter.split.each do|chunk|
|
@@ -31,10 +30,12 @@ module Speech
|
|
31
30
|
def convert_chunk(easy, chunk, options={})
|
32
31
|
puts "sending chunk of size #{chunk.duration}..."
|
33
32
|
retrying = true
|
34
|
-
|
33
|
+
retry_count = 0
|
34
|
+
while retrying && retry_count < 5
|
35
35
|
#easy.verbose = true
|
36
36
|
easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
|
37
37
|
easy.headers['User-Agent'] = "https://github.com/taf2/speech2text"
|
38
|
+
#puts chunk.inspect
|
38
39
|
easy.post_body = "Content=#{chunk.to_flac_bytes}"
|
39
40
|
easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
|
40
41
|
easy.on_complete {|easy| puts }
|
@@ -44,25 +45,16 @@ module Speech
|
|
44
45
|
if easy.response_code == 500
|
45
46
|
puts "500 from google retry after 0.5 seconds"
|
46
47
|
retrying = true
|
47
|
-
|
48
|
+
retry_count += 1
|
49
|
+
sleep 0.5 # wait longer on error?, google??
|
48
50
|
else
|
49
51
|
# {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
|
50
52
|
data = JSON.parse(easy.body_str)
|
51
|
-
|
52
|
-
data['
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
}
|
57
|
-
File.open("#{self.captured_file}", "wb") {|f|
|
58
|
-
size = self.captured_json.size
|
59
|
-
if size > 0
|
60
|
-
confidence_calc = self.confidence / size
|
61
|
-
else
|
62
|
-
confidence_calc = 0
|
63
|
-
end
|
64
|
-
f << {:captured_json => captured_json, :confidence => confidence_calc}.to_json
|
65
|
-
}
|
53
|
+
self.captured_json['status'] = data['status']
|
54
|
+
self.captured_json['id'] = data['id']
|
55
|
+
self.captured_json['hypotheses'] = data['hypotheses'].map {|ut| [ut['utterance'], ut['confidence']] }
|
56
|
+
puts self.captured_json.inspect
|
57
|
+
File.open("#{self.captured_file}", "wb") {|f| f << captured_json.to_json }
|
66
58
|
retrying = false
|
67
59
|
end
|
68
60
|
sleep 0.1 # not too fast there tiger
|
data/lib/speech/version.rb
CHANGED
data/test/audio_to_text_test.rb
CHANGED
@@ -18,4 +18,22 @@ class SpeechAudioToTextTest < Test::Unit::TestCase
|
|
18
18
|
ensure
|
19
19
|
audio.clean
|
20
20
|
end
|
21
|
+
|
22
|
+
def test_short_audio_clip
|
23
|
+
audio = Speech::AudioToText.new("samples/i-like-pickles.chunk5.wav")
|
24
|
+
captured_json = audio.to_text
|
25
|
+
assert captured_json
|
26
|
+
assert captured_json.key?("hypotheses")
|
27
|
+
assert !captured_json['hypotheses'].empty?
|
28
|
+
#{"status"=>0, "id"=>"552de5ba35bb769ce3493ff113e158a8-1", "hypotheses"=>[["eagles", 0.7214844], ["pickles", nil], ["michaels", nil], ["giggles", nil], ["tickles", nil]]}
|
29
|
+
assert captured_json.keys.include?('status')
|
30
|
+
assert captured_json.keys.include?('id')
|
31
|
+
assert captured_json.keys.include?('hypotheses')
|
32
|
+
puts captured_json.inspect
|
33
|
+
assert_equal "eagles", captured_json['hypotheses'][0].first
|
34
|
+
assert_equal "pickles", captured_json['hypotheses'][1].first
|
35
|
+
#assert captured_json['confidence'] > 0.9
|
36
|
+
ensure
|
37
|
+
audio.clean
|
38
|
+
end
|
21
39
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: speech2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,12 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-04-
|
12
|
+
date: 2011-04-04 00:00:00.000000000 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: curb
|
17
|
-
requirement: &
|
17
|
+
requirement: &2163572140 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>='
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: '0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *2163572140
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: json
|
28
|
-
requirement: &
|
28
|
+
requirement: &2163571700 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ! '>='
|
@@ -33,7 +33,7 @@ dependencies:
|
|
33
33
|
version: '0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *2163571700
|
37
37
|
description: Super powers of Google wrapped in a nice Ruby interface
|
38
38
|
email: todd.fisher@gmail.com
|
39
39
|
executables:
|