speech2text 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/speech/audio_inspector.rb +8 -2
- data/lib/speech/audio_splitter.rb +17 -12
- data/lib/speech/audio_to_text.rb +38 -21
- data/lib/speech/version.rb +1 -1
- data/test/audio_splitter_test.rb +3 -3
- data/test/audio_to_text_test.rb +11 -31
- metadata +19 -10
- data/test/samples/i-like-pickles.json +0 -1
@@ -14,7 +14,7 @@ module Speech
|
|
14
14
|
|
15
15
|
def to_s
|
16
16
|
s,f = seconds.split('.')
|
17
|
-
sprintf "%.2d:%.2d:%.2d
|
17
|
+
sprintf "%.2d:%.2d:%.2d.%.2d", self.hours.to_i, self.minutes.to_i, s.to_i, (f||0).to_i
|
18
18
|
#"#{hours}:#{minutes}:#{seconds}:#{f}"
|
19
19
|
end
|
20
20
|
|
@@ -47,7 +47,13 @@ module Speech
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def initialize(file)
|
50
|
-
|
50
|
+
out = `ffmpeg -i #{file} 2>&1`.strip
|
51
|
+
if out.match(/No such file or directory/)
|
52
|
+
raise "No such file or directory: #{file}"
|
53
|
+
else
|
54
|
+
out = out.scan(/Duration: (.*),/)
|
55
|
+
self.duration = Duration.new(out.first.first)
|
56
|
+
end
|
51
57
|
end
|
52
58
|
|
53
59
|
end
|
@@ -25,13 +25,13 @@ module Speech
|
|
25
25
|
# given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
|
26
26
|
def build
|
27
27
|
return self if self.copied
|
28
|
-
# ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00
|
29
|
-
offset_ts = AudioInspector::Duration.from_seconds(self.offset)
|
30
|
-
duration_ts = AudioInspector::Duration.from_seconds(self.duration)
|
28
|
+
# ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00.00 -t 00:00:30.00 sample.audio.out.wav
|
29
|
+
offset_ts = AudioInspector::Duration.from_seconds(self.offset).to_s
|
30
|
+
duration_ts = AudioInspector::Duration.from_seconds(self.duration).to_s
|
31
31
|
# NOTE: kind of a hack, but if the original source is less than or equal to 1 second, we should skip ffmpeg
|
32
|
-
puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
|
32
|
+
#puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
|
33
33
|
#puts "offset: #{ offset_ts.to_s }, duration: #{duration_ts.to_s}"
|
34
|
-
cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk}"
|
34
|
+
cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk} >/dev/null 2>&1"
|
35
35
|
if system(cmd)
|
36
36
|
self
|
37
37
|
else
|
@@ -41,10 +41,9 @@ module Speech
|
|
41
41
|
|
42
42
|
# convert the audio file to flac format
|
43
43
|
def to_flac
|
44
|
-
|
45
|
-
if system("
|
46
|
-
|
47
|
-
self.flac_chunk = chunk.gsub(File.extname(chunk), ".flac")
|
44
|
+
chunk_outputfile = chunk.gsub(/#{File.extname(chunk)}$/, ".flac")
|
45
|
+
if system("ffmpeg -i #{chunk} -acodec flac #{chunk_outputfile} >/dev/null 2>&1")
|
46
|
+
self.flac_chunk = chunk.gsub(/#{File.extname(chunk)}$/, ".flac")
|
48
47
|
# convert the audio file to 16K
|
49
48
|
self.flac_rate = `ffmpeg -i #{self.flac_chunk} 2>&1`.strip.scan(/Audio: flac, (.*) Hz/).first.first.strip
|
50
49
|
down_sampled = self.flac_chunk.gsub(/\.flac$/, '-sampled.flac')
|
@@ -72,7 +71,7 @@ module Speech
|
|
72
71
|
|
73
72
|
end
|
74
73
|
|
75
|
-
def initialize(file, chunk_size=
|
74
|
+
def initialize(file, chunk_size=5)
|
76
75
|
self.original_file = file
|
77
76
|
self.duration = AudioInspector.new(file).duration
|
78
77
|
self.size = chunk_size
|
@@ -86,7 +85,13 @@ module Speech
|
|
86
85
|
#puts "generate: #{full_chunks} chunks of #{size} seconds, last: #{last_chunk} seconds"
|
87
86
|
|
88
87
|
(full_chunks-1).times do|chunkid|
|
89
|
-
|
88
|
+
if chunkid > 0
|
89
|
+
chunks << AudioChunk.new(self, chunkid * self.size, self.size)
|
90
|
+
else
|
91
|
+
off = (chunkid * self.size)-(self.size/2)
|
92
|
+
off = 0 if off < 0
|
93
|
+
chunks << AudioChunk.new(self, off, self.size)
|
94
|
+
end
|
90
95
|
end
|
91
96
|
|
92
97
|
if chunks.empty?
|
@@ -94,7 +99,7 @@ module Speech
|
|
94
99
|
else
|
95
100
|
chunks << AudioChunk.new(self, chunks.last.offset.to_i + chunks.last.duration.to_i, self.size + last_chunk)
|
96
101
|
end
|
97
|
-
puts "Chunk count: #{chunks.size}"
|
102
|
+
#puts "Chunk count: #{chunks.size}"
|
98
103
|
|
99
104
|
chunks
|
100
105
|
end
|
data/lib/speech/audio_to_text.rb
CHANGED
@@ -2,63 +2,80 @@
|
|
2
2
|
module Speech
|
3
3
|
|
4
4
|
class AudioToText
|
5
|
-
attr_accessor :file, :rate, :captured_json
|
5
|
+
attr_accessor :file, :rate, :captured_json
|
6
|
+
attr_accessor :best_match_text, :score, :verbose, :segments
|
6
7
|
|
7
|
-
def initialize(file)
|
8
|
+
def initialize(file, options={})
|
9
|
+
self.verbose = false
|
8
10
|
self.file = file
|
9
|
-
self.captured_file = self.file.gsub(/\.wav$/,'.json')
|
10
11
|
self.captured_json = {}
|
12
|
+
self.best_match_text = ""
|
13
|
+
self.score = 0.0
|
14
|
+
self.segments = 0
|
15
|
+
|
16
|
+
self.verbose = !!options[:verbose] if options.key?(:verbose)
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_text(max=2,lang="en-US")
|
20
|
+
to_json(max,lang)
|
21
|
+
self.best_match_text
|
11
22
|
end
|
12
23
|
|
13
|
-
def
|
14
|
-
|
24
|
+
def to_json(max=2,lang="en-US")
|
25
|
+
self.best_match_text = ""
|
26
|
+
self.score = 0.0
|
27
|
+
self.segments = 0
|
28
|
+
|
29
|
+
url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=#{lang}&maxresults=#{max}"
|
15
30
|
splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
|
16
31
|
easy = Curl::Easy.new(url)
|
17
32
|
splitter.split.each do|chunk|
|
18
33
|
chunk.build.to_flac
|
19
34
|
convert_chunk(easy, chunk)
|
20
35
|
end
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
def clean
|
25
|
-
File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
|
36
|
+
self.best_match_text = self.best_match_text.strip
|
37
|
+
self.score /= self.segments
|
38
|
+
self.captured_json
|
26
39
|
end
|
27
40
|
|
28
41
|
protected
|
29
42
|
|
30
43
|
def convert_chunk(easy, chunk, options={})
|
31
|
-
puts "sending chunk of size #{chunk.duration}..."
|
44
|
+
puts "sending chunk of size #{chunk.duration}..." if self.verbose
|
32
45
|
retrying = true
|
33
46
|
retry_count = 0
|
34
|
-
while retrying && retry_count <
|
35
|
-
|
47
|
+
while retrying && retry_count < 3 # 3 retries
|
48
|
+
easy.verbose = self.verbose
|
36
49
|
easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
|
37
50
|
easy.headers['User-Agent'] = "https://github.com/taf2/speech2text"
|
38
|
-
#puts chunk.inspect
|
39
51
|
easy.post_body = "Content=#{chunk.to_flac_bytes}"
|
40
|
-
|
52
|
+
if self.verbose
|
53
|
+
easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
|
54
|
+
end
|
41
55
|
easy.on_complete {|easy| puts }
|
42
56
|
easy.http_post
|
43
|
-
#puts easy.header_str
|
44
|
-
#puts easy.body_str
|
45
57
|
if easy.response_code == 500
|
46
|
-
puts "500 from google retry after 0.5 seconds"
|
58
|
+
puts "500 from google retry after 0.5 seconds" if self.verbose
|
47
59
|
retrying = true
|
48
60
|
retry_count += 1
|
49
61
|
sleep 0.5 # wait longer on error?, google??
|
50
62
|
else
|
51
|
-
# {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance"
|
63
|
+
# {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance"=>"I like pickles", "confidence"=>0.59408695}, {"utterance"=>"I like turtles"}, {"utterance"=>"I like tickles"}, {"utterance"=>"I like to Kohl's"}, {"utterance"=>"I Like tickles"}, {"utterance"=>"I lyk tickles"}, {"utterance"=>"I liked to Kohl's"}]}
|
52
64
|
data = JSON.parse(easy.body_str)
|
53
65
|
self.captured_json['status'] = data['status']
|
54
66
|
self.captured_json['id'] = data['id']
|
55
67
|
self.captured_json['hypotheses'] = data['hypotheses'].map {|ut| [ut['utterance'], ut['confidence']] }
|
56
|
-
|
57
|
-
|
68
|
+
if data.key?('hypotheses') && ['hypotheses'].first
|
69
|
+
self.best_match_text += " " + data['hypotheses'].first['utterance']
|
70
|
+
self.score += data['hypotheses'].first['confidence']
|
71
|
+
self.segments += 1
|
72
|
+
end
|
58
73
|
retrying = false
|
59
74
|
end
|
60
75
|
sleep 0.1 # not too fast there tiger
|
61
76
|
end
|
77
|
+
puts "#{segments} processed: #{self.captured_json.inspect}" if self.verbose
|
78
|
+
self.captured_json
|
62
79
|
ensure
|
63
80
|
chunk.clean
|
64
81
|
end
|
data/lib/speech/version.rb
CHANGED
data/test/audio_splitter_test.rb
CHANGED
@@ -6,10 +6,10 @@ require 'speech'
|
|
6
6
|
class SpeechAudioSplitterTest < Test::Unit::TestCase
|
7
7
|
|
8
8
|
def test_audio_splitter
|
9
|
-
splitter = Speech::AudioSplitter.new("samples/i-like-pickles.wav", 1)
|
9
|
+
splitter = Speech::AudioSplitter.new(File.expand_path(File.join(File.dirname(__FILE__),"samples/i-like-pickles.wav")), 1)
|
10
10
|
|
11
|
-
assert_equal '00:00:03
|
12
|
-
assert_equal 3.
|
11
|
+
assert_equal '00:00:03.51', splitter.duration.to_s
|
12
|
+
assert_equal 3.51, splitter.duration.to_f
|
13
13
|
|
14
14
|
chunks = splitter.split
|
15
15
|
assert_equal 3, chunks.size
|
data/test/audio_to_text_test.rb
CHANGED
@@ -4,39 +4,19 @@ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
|
4
4
|
require 'speech'
|
5
5
|
|
6
6
|
class SpeechAudioToTextTest < Test::Unit::TestCase
|
7
|
-
def
|
8
|
-
|
9
|
-
|
10
|
-
assert captured_json
|
11
|
-
assert captured_json.key?("hypotheses")
|
12
|
-
assert !captured_json['hypotheses'].empty?
|
13
|
-
assert captured_json.keys.include?('status')
|
14
|
-
assert captured_json.keys.include?('id')
|
15
|
-
assert captured_json.keys.include?('hypotheses')
|
7
|
+
def setup
|
8
|
+
super
|
9
|
+
end
|
16
10
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
# puts captured_json.inspect
|
21
|
-
ensure
|
22
|
-
audio.clean
|
11
|
+
def test_audio_to_text
|
12
|
+
audio = Speech::AudioToText.new(File.expand_path(File.join(File.dirname(__FILE__),"samples/i-like-pickles.wav")))
|
13
|
+
assert_equal "I like pickles", audio.to_text
|
23
14
|
end
|
24
15
|
|
25
|
-
def
|
26
|
-
audio = Speech::AudioToText.new("
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
assert !captured_json['hypotheses'].empty?
|
31
|
-
#{"status"=>0, "id"=>"552de5ba35bb769ce3493ff113e158a8-1", "hypotheses"=>[["eagles", 0.7214844], ["pickles", nil], ["michaels", nil], ["giggles", nil], ["tickles", nil]]}
|
32
|
-
assert captured_json.keys.include?('status')
|
33
|
-
assert captured_json.keys.include?('id')
|
34
|
-
assert captured_json.keys.include?('hypotheses')
|
35
|
-
puts captured_json.inspect
|
36
|
-
assert_equal "eagles", captured_json['hypotheses'][0].first
|
37
|
-
assert_equal "pickles", captured_json['hypotheses'][1].first
|
38
|
-
#assert captured_json['confidence'] > 0.9
|
39
|
-
ensure
|
40
|
-
audio.clean
|
16
|
+
def test_longer_audio
|
17
|
+
audio = Speech::AudioToText.new(File.expand_path(File.join(File.dirname(__FILE__),"/SampleAudio.wav")), :verbose => true)
|
18
|
+
puts audio.to_text
|
19
|
+
puts audio.score
|
20
|
+
puts audio.segments
|
41
21
|
end
|
42
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: speech2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: !binary |-
|
5
|
+
MC4zLjY=
|
5
6
|
prerelease:
|
6
7
|
platform: ruby
|
7
8
|
authors:
|
@@ -9,12 +10,11 @@ authors:
|
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date:
|
13
|
-
default_executable:
|
13
|
+
date: 2012-10-07 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: curb
|
17
|
-
requirement:
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>='
|
@@ -22,10 +22,15 @@ dependencies:
|
|
22
22
|
version: '0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements:
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ! '>='
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '0'
|
26
31
|
- !ruby/object:Gem::Dependency
|
27
32
|
name: json
|
28
|
-
requirement:
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
29
34
|
none: false
|
30
35
|
requirements:
|
31
36
|
- - ! '>='
|
@@ -33,7 +38,12 @@ dependencies:
|
|
33
38
|
version: '0'
|
34
39
|
type: :runtime
|
35
40
|
prerelease: false
|
36
|
-
version_requirements:
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
37
47
|
description: Super powers of Google wrapped in a nice Ruby interface
|
38
48
|
email: todd.fisher@gmail.com
|
39
49
|
executables:
|
@@ -51,12 +61,10 @@ files:
|
|
51
61
|
- test/audio_splitter_test.rb
|
52
62
|
- test/audio_to_text_test.rb
|
53
63
|
- test/SampleAudio.wav
|
54
|
-
- test/samples/i-like-pickles.json
|
55
64
|
- test/samples/i-like-pickles.wav
|
56
65
|
- Rakefile
|
57
66
|
- README.rdoc
|
58
67
|
- speech2text.gemspec
|
59
|
-
has_rdoc: true
|
60
68
|
homepage: https://github.com/taf2/speech2text
|
61
69
|
licenses: []
|
62
70
|
post_install_message:
|
@@ -77,8 +85,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
85
|
version: '0'
|
78
86
|
requirements: []
|
79
87
|
rubyforge_project:
|
80
|
-
rubygems_version: 1.
|
88
|
+
rubygems_version: 1.8.24
|
81
89
|
signing_key:
|
82
90
|
specification_version: 3
|
83
91
|
summary: Speech to Text Library
|
84
92
|
test_files: []
|
93
|
+
has_rdoc:
|
@@ -1 +0,0 @@
|
|
1
|
-
{"captured_json":[["I like pickles",0.92731786]],"confidence":0.92731786}
|