speech2text 0.3.4 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/speech/audio_inspector.rb +8 -2
- data/lib/speech/audio_splitter.rb +17 -12
- data/lib/speech/audio_to_text.rb +38 -21
- data/lib/speech/version.rb +1 -1
- data/test/audio_splitter_test.rb +3 -3
- data/test/audio_to_text_test.rb +11 -31
- metadata +19 -10
- data/test/samples/i-like-pickles.json +0 -1
@@ -14,7 +14,7 @@ module Speech
|
|
14
14
|
|
15
15
|
def to_s
|
16
16
|
s,f = seconds.split('.')
|
17
|
-
sprintf "%.2d:%.2d:%.2d
|
17
|
+
sprintf "%.2d:%.2d:%.2d.%.2d", self.hours.to_i, self.minutes.to_i, s.to_i, (f||0).to_i
|
18
18
|
#"#{hours}:#{minutes}:#{seconds}:#{f}"
|
19
19
|
end
|
20
20
|
|
@@ -47,7 +47,13 @@ module Speech
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def initialize(file)
|
50
|
-
|
50
|
+
out = `ffmpeg -i #{file} 2>&1`.strip
|
51
|
+
if out.match(/No such file or directory/)
|
52
|
+
raise "No such file or directory: #{file}"
|
53
|
+
else
|
54
|
+
out = out.scan(/Duration: (.*),/)
|
55
|
+
self.duration = Duration.new(out.first.first)
|
56
|
+
end
|
51
57
|
end
|
52
58
|
|
53
59
|
end
|
@@ -25,13 +25,13 @@ module Speech
|
|
25
25
|
# given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
|
26
26
|
def build
|
27
27
|
return self if self.copied
|
28
|
-
# ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00
|
29
|
-
offset_ts = AudioInspector::Duration.from_seconds(self.offset)
|
30
|
-
duration_ts = AudioInspector::Duration.from_seconds(self.duration)
|
28
|
+
# ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00.00 -t 00:00:30.00 sample.audio.out.wav
|
29
|
+
offset_ts = AudioInspector::Duration.from_seconds(self.offset).to_s
|
30
|
+
duration_ts = AudioInspector::Duration.from_seconds(self.duration).to_s
|
31
31
|
# NOTE: kind of a hack, but if the original source is less than or equal to 1 second, we should skip ffmpeg
|
32
|
-
puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
|
32
|
+
#puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
|
33
33
|
#puts "offset: #{ offset_ts.to_s }, duration: #{duration_ts.to_s}"
|
34
|
-
cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk}
|
34
|
+
cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk} >/dev/null 2>&1"
|
35
35
|
if system(cmd)
|
36
36
|
self
|
37
37
|
else
|
@@ -41,10 +41,9 @@ module Speech
|
|
41
41
|
|
42
42
|
# convert the audio file to flac format
|
43
43
|
def to_flac
|
44
|
-
|
45
|
-
if system("
|
46
|
-
|
47
|
-
self.flac_chunk = chunk.gsub(File.extname(chunk), ".flac")
|
44
|
+
chunk_outputfile = chunk.gsub(/#{File.extname(chunk)}$/, ".flac")
|
45
|
+
if system("ffmpeg -i #{chunk} -acodec flac #{chunk_outputfile} >/dev/null 2>&1")
|
46
|
+
self.flac_chunk = chunk.gsub(/#{File.extname(chunk)}$/, ".flac")
|
48
47
|
# convert the audio file to 16K
|
49
48
|
self.flac_rate = `ffmpeg -i #{self.flac_chunk} 2>&1`.strip.scan(/Audio: flac, (.*) Hz/).first.first.strip
|
50
49
|
down_sampled = self.flac_chunk.gsub(/\.flac$/, '-sampled.flac')
|
@@ -72,7 +71,7 @@ module Speech
|
|
72
71
|
|
73
72
|
end
|
74
73
|
|
75
|
-
def initialize(file, chunk_size=
|
74
|
+
def initialize(file, chunk_size=5)
|
76
75
|
self.original_file = file
|
77
76
|
self.duration = AudioInspector.new(file).duration
|
78
77
|
self.size = chunk_size
|
@@ -86,7 +85,13 @@ module Speech
|
|
86
85
|
#puts "generate: #{full_chunks} chunks of #{size} seconds, last: #{last_chunk} seconds"
|
87
86
|
|
88
87
|
(full_chunks-1).times do|chunkid|
|
89
|
-
|
88
|
+
if chunkid > 0
|
89
|
+
chunks << AudioChunk.new(self, chunkid * self.size, self.size)
|
90
|
+
else
|
91
|
+
off = (chunkid * self.size)-(self.size/2)
|
92
|
+
off = 0 if off < 0
|
93
|
+
chunks << AudioChunk.new(self, off, self.size)
|
94
|
+
end
|
90
95
|
end
|
91
96
|
|
92
97
|
if chunks.empty?
|
@@ -94,7 +99,7 @@ module Speech
|
|
94
99
|
else
|
95
100
|
chunks << AudioChunk.new(self, chunks.last.offset.to_i + chunks.last.duration.to_i, self.size + last_chunk)
|
96
101
|
end
|
97
|
-
puts "Chunk count: #{chunks.size}"
|
102
|
+
#puts "Chunk count: #{chunks.size}"
|
98
103
|
|
99
104
|
chunks
|
100
105
|
end
|
data/lib/speech/audio_to_text.rb
CHANGED
@@ -2,63 +2,80 @@
|
|
2
2
|
module Speech
|
3
3
|
|
4
4
|
class AudioToText
|
5
|
-
attr_accessor :file, :rate, :captured_json
|
5
|
+
attr_accessor :file, :rate, :captured_json
|
6
|
+
attr_accessor :best_match_text, :score, :verbose, :segments
|
6
7
|
|
7
|
-
def initialize(file)
|
8
|
+
def initialize(file, options={})
|
9
|
+
self.verbose = false
|
8
10
|
self.file = file
|
9
|
-
self.captured_file = self.file.gsub(/\.wav$/,'.json')
|
10
11
|
self.captured_json = {}
|
12
|
+
self.best_match_text = ""
|
13
|
+
self.score = 0.0
|
14
|
+
self.segments = 0
|
15
|
+
|
16
|
+
self.verbose = !!options[:verbose] if options.key?(:verbose)
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_text(max=2,lang="en-US")
|
20
|
+
to_json(max,lang)
|
21
|
+
self.best_match_text
|
11
22
|
end
|
12
23
|
|
13
|
-
def
|
14
|
-
|
24
|
+
def to_json(max=2,lang="en-US")
|
25
|
+
self.best_match_text = ""
|
26
|
+
self.score = 0.0
|
27
|
+
self.segments = 0
|
28
|
+
|
29
|
+
url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=#{lang}&maxresults=#{max}"
|
15
30
|
splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
|
16
31
|
easy = Curl::Easy.new(url)
|
17
32
|
splitter.split.each do|chunk|
|
18
33
|
chunk.build.to_flac
|
19
34
|
convert_chunk(easy, chunk)
|
20
35
|
end
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
def clean
|
25
|
-
File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
|
36
|
+
self.best_match_text = self.best_match_text.strip
|
37
|
+
self.score /= self.segments
|
38
|
+
self.captured_json
|
26
39
|
end
|
27
40
|
|
28
41
|
protected
|
29
42
|
|
30
43
|
def convert_chunk(easy, chunk, options={})
|
31
|
-
puts "sending chunk of size #{chunk.duration}..."
|
44
|
+
puts "sending chunk of size #{chunk.duration}..." if self.verbose
|
32
45
|
retrying = true
|
33
46
|
retry_count = 0
|
34
|
-
while retrying && retry_count <
|
35
|
-
|
47
|
+
while retrying && retry_count < 3 # 3 retries
|
48
|
+
easy.verbose = self.verbose
|
36
49
|
easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
|
37
50
|
easy.headers['User-Agent'] = "https://github.com/taf2/speech2text"
|
38
|
-
#puts chunk.inspect
|
39
51
|
easy.post_body = "Content=#{chunk.to_flac_bytes}"
|
40
|
-
|
52
|
+
if self.verbose
|
53
|
+
easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
|
54
|
+
end
|
41
55
|
easy.on_complete {|easy| puts }
|
42
56
|
easy.http_post
|
43
|
-
#puts easy.header_str
|
44
|
-
#puts easy.body_str
|
45
57
|
if easy.response_code == 500
|
46
|
-
puts "500 from google retry after 0.5 seconds"
|
58
|
+
puts "500 from google retry after 0.5 seconds" if self.verbose
|
47
59
|
retrying = true
|
48
60
|
retry_count += 1
|
49
61
|
sleep 0.5 # wait longer on error?, google??
|
50
62
|
else
|
51
|
-
# {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance"
|
63
|
+
# {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance"=>"I like pickles", "confidence"=>0.59408695}, {"utterance"=>"I like turtles"}, {"utterance"=>"I like tickles"}, {"utterance"=>"I like to Kohl's"}, {"utterance"=>"I Like tickles"}, {"utterance"=>"I lyk tickles"}, {"utterance"=>"I liked to Kohl's"}]}
|
52
64
|
data = JSON.parse(easy.body_str)
|
53
65
|
self.captured_json['status'] = data['status']
|
54
66
|
self.captured_json['id'] = data['id']
|
55
67
|
self.captured_json['hypotheses'] = data['hypotheses'].map {|ut| [ut['utterance'], ut['confidence']] }
|
56
|
-
|
57
|
-
|
68
|
+
if data.key?('hypotheses') && ['hypotheses'].first
|
69
|
+
self.best_match_text += " " + data['hypotheses'].first['utterance']
|
70
|
+
self.score += data['hypotheses'].first['confidence']
|
71
|
+
self.segments += 1
|
72
|
+
end
|
58
73
|
retrying = false
|
59
74
|
end
|
60
75
|
sleep 0.1 # not too fast there tiger
|
61
76
|
end
|
77
|
+
puts "#{segments} processed: #{self.captured_json.inspect}" if self.verbose
|
78
|
+
self.captured_json
|
62
79
|
ensure
|
63
80
|
chunk.clean
|
64
81
|
end
|
data/lib/speech/version.rb
CHANGED
data/test/audio_splitter_test.rb
CHANGED
@@ -6,10 +6,10 @@ require 'speech'
|
|
6
6
|
class SpeechAudioSplitterTest < Test::Unit::TestCase
|
7
7
|
|
8
8
|
def test_audio_splitter
|
9
|
-
splitter = Speech::AudioSplitter.new("samples/i-like-pickles.wav", 1)
|
9
|
+
splitter = Speech::AudioSplitter.new(File.expand_path(File.join(File.dirname(__FILE__),"samples/i-like-pickles.wav")), 1)
|
10
10
|
|
11
|
-
assert_equal '00:00:03
|
12
|
-
assert_equal 3.
|
11
|
+
assert_equal '00:00:03.51', splitter.duration.to_s
|
12
|
+
assert_equal 3.51, splitter.duration.to_f
|
13
13
|
|
14
14
|
chunks = splitter.split
|
15
15
|
assert_equal 3, chunks.size
|
data/test/audio_to_text_test.rb
CHANGED
@@ -4,39 +4,19 @@ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
|
4
4
|
require 'speech'
|
5
5
|
|
6
6
|
class SpeechAudioToTextTest < Test::Unit::TestCase
|
7
|
-
def
|
8
|
-
|
9
|
-
|
10
|
-
assert captured_json
|
11
|
-
assert captured_json.key?("hypotheses")
|
12
|
-
assert !captured_json['hypotheses'].empty?
|
13
|
-
assert captured_json.keys.include?('status')
|
14
|
-
assert captured_json.keys.include?('id')
|
15
|
-
assert captured_json.keys.include?('hypotheses')
|
7
|
+
def setup
|
8
|
+
super
|
9
|
+
end
|
16
10
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
# puts captured_json.inspect
|
21
|
-
ensure
|
22
|
-
audio.clean
|
11
|
+
def test_audio_to_text
|
12
|
+
audio = Speech::AudioToText.new(File.expand_path(File.join(File.dirname(__FILE__),"samples/i-like-pickles.wav")))
|
13
|
+
assert_equal "I like pickles", audio.to_text
|
23
14
|
end
|
24
15
|
|
25
|
-
def
|
26
|
-
audio = Speech::AudioToText.new("
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
assert !captured_json['hypotheses'].empty?
|
31
|
-
#{"status"=>0, "id"=>"552de5ba35bb769ce3493ff113e158a8-1", "hypotheses"=>[["eagles", 0.7214844], ["pickles", nil], ["michaels", nil], ["giggles", nil], ["tickles", nil]]}
|
32
|
-
assert captured_json.keys.include?('status')
|
33
|
-
assert captured_json.keys.include?('id')
|
34
|
-
assert captured_json.keys.include?('hypotheses')
|
35
|
-
puts captured_json.inspect
|
36
|
-
assert_equal "eagles", captured_json['hypotheses'][0].first
|
37
|
-
assert_equal "pickles", captured_json['hypotheses'][1].first
|
38
|
-
#assert captured_json['confidence'] > 0.9
|
39
|
-
ensure
|
40
|
-
audio.clean
|
16
|
+
def test_longer_audio
|
17
|
+
audio = Speech::AudioToText.new(File.expand_path(File.join(File.dirname(__FILE__),"/SampleAudio.wav")), :verbose => true)
|
18
|
+
puts audio.to_text
|
19
|
+
puts audio.score
|
20
|
+
puts audio.segments
|
41
21
|
end
|
42
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: speech2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: !binary |-
|
5
|
+
MC4zLjY=
|
5
6
|
prerelease:
|
6
7
|
platform: ruby
|
7
8
|
authors:
|
@@ -9,12 +10,11 @@ authors:
|
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date:
|
13
|
-
default_executable:
|
13
|
+
date: 2012-10-07 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: curb
|
17
|
-
requirement:
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>='
|
@@ -22,10 +22,15 @@ dependencies:
|
|
22
22
|
version: '0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements:
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ! '>='
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '0'
|
26
31
|
- !ruby/object:Gem::Dependency
|
27
32
|
name: json
|
28
|
-
requirement:
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
29
34
|
none: false
|
30
35
|
requirements:
|
31
36
|
- - ! '>='
|
@@ -33,7 +38,12 @@ dependencies:
|
|
33
38
|
version: '0'
|
34
39
|
type: :runtime
|
35
40
|
prerelease: false
|
36
|
-
version_requirements:
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
37
47
|
description: Super powers of Google wrapped in a nice Ruby interface
|
38
48
|
email: todd.fisher@gmail.com
|
39
49
|
executables:
|
@@ -51,12 +61,10 @@ files:
|
|
51
61
|
- test/audio_splitter_test.rb
|
52
62
|
- test/audio_to_text_test.rb
|
53
63
|
- test/SampleAudio.wav
|
54
|
-
- test/samples/i-like-pickles.json
|
55
64
|
- test/samples/i-like-pickles.wav
|
56
65
|
- Rakefile
|
57
66
|
- README.rdoc
|
58
67
|
- speech2text.gemspec
|
59
|
-
has_rdoc: true
|
60
68
|
homepage: https://github.com/taf2/speech2text
|
61
69
|
licenses: []
|
62
70
|
post_install_message:
|
@@ -77,8 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
85
|
version: '0'
|
78
86
|
requirements: []
|
79
87
|
rubyforge_project:
|
80
|
-
rubygems_version: 1.
|
88
|
+
rubygems_version: 1.8.24
|
81
89
|
signing_key:
|
82
90
|
specification_version: 3
|
83
91
|
summary: Speech to Text Library
|
84
92
|
test_files: []
|
93
|
+
has_rdoc:
|
@@ -1 +0,0 @@
|
|
1
|
-
{"captured_json":[["I like pickles",0.92731786]],"confidence":0.92731786}
|