speech2text 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +17 -0
- data/Rakefile +9 -0
- data/bin/speech2text +7 -0
- data/lib/speech.rb +9 -0
- data/lib/speech/audio_inspector.rb +43 -0
- data/lib/speech/audio_splitter.rb +87 -0
- data/lib/speech/audio_to_text.rb +73 -0
- data/lib/speech/version.rb +6 -0
- data/speech2text.gemspec +17 -0
- data/test/SampleAudio.wav +0 -0
- data/test/audio_inspector_test.rb +9 -0
- data/test/audio_splitter_test.rb +26 -0
- data/test/audio_to_text_test.rb +21 -0
- data/test/i-like-pickles.wav +0 -0
- data/test/samples/i-like-pickles.wav +0 -0
- metadata +84 -0
data/README.rdoc
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
== Speech2Text
|
2
|
+
|
3
|
+
Speech2Text is a simple Ruby interface for converting speech to text, built on the power of ffmpeg, flac and Google.
|
4
|
+
|
5
|
+
Using a new undocumented speech API from Google with the help of this article: http://mikepultz.com/2011/03/accessing-google-speech-api-chrome-11/
|
6
|
+
|
7
|
+
We're able to provide a very simple API in Ruby to decode simple audio to text.
|
8
|
+
|
9
|
+
The API from Google is not yet public and so may change. It also seems to be very fragile: more often than not it will return a 500, so the library has retry code built in. For larger audio files, 10+ failures may be returned before a successful result is retrieved.
|
10
|
+
|
11
|
+
It also appears that the API only likes smaller audio files, so there is a built-in chunker that allows us to split the audio up into smaller chunks.
|
12
|
+
|
13
|
+
== Example
|
14
|
+
|
15
|
+
audio = Speech::AudioToText.new("i-like-pickles.wav")
|
16
|
+
puts audio.to_text.inspect
|
17
|
+
=> {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
|
data/Rakefile
ADDED
data/bin/speech2text
ADDED
data/lib/speech.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
require 'shellwords'

module Speech

  # Determines the duration of an audio file by running `ffmpeg -i` on it and
  # parsing the "Duration: HH:MM:SS.ff" line that ffmpeg prints.
  class AudioInspector
    attr_accessor :duration

    # A duration broken into hours / minutes / seconds.
    #
    # The three components are stored as Strings (+seconds+ may carry a
    # fractional part, e.g. "03.52"); +total_seconds+ is the whole duration
    # expressed as a Float number of seconds.
    class Duration
      attr_accessor :hours, :minutes, :seconds, :total_seconds

      # duration_str - a String of the form "HH:MM:SS[.ff]".
      def initialize(duration_str)
        self.hours, self.minutes, self.seconds = duration_str.to_s.strip.split(':')
        self.total_seconds = (self.hours.to_i * 3600) + (self.minutes.to_i * 60) + self.seconds.to_f
      end

      # Returns the duration as "HH:MM:SS:ff", where ff is the first two
      # digits after the decimal point of the seconds component.
      def to_s
        whole, fraction = seconds.to_s.split('.')
        # Right-pad the fraction so that "1.5" renders as ":50" and cannot be
        # confused with "1.05" (":05").
        hundredths = fraction.to_s[0, 2].ljust(2, '0')
        # Convert with String#to_i (always base 10). Passing the raw strings
        # to %d makes sprintf treat a leading zero as an octal prefix, which
        # raises ArgumentError for "08" and "09".
        sprintf "%.2d:%.2d:%.2d:%.2d", hours.to_i, minutes.to_i, whole.to_i, hundredths.to_i
      end

      # Returns the duration as a Float number of seconds.
      def to_f
        self.total_seconds
      end

      # Builds a Duration from a number of seconds (Integer or Float).
      #
      # The value is rounded to hundredths of a second before it is split
      # into components, so float noise such as 5.519999999999996 cannot leak
      # into the formatted output. Integer input keeps an integer seconds
      # component ("30"); Float input gets two decimals ("1.50").
      def self.from_seconds(seconds)
        whole, hundredths = (seconds.to_f * 100).round.divmod(100)
        hours, remainder = whole.divmod(3600)
        minutes, secs = remainder.divmod(60)
        secs_str = seconds.is_a?(Integer) ? secs.to_s : sprintf("%d.%02d", secs, hundredths)

        Duration.new("#{hours}:#{minutes}:#{secs_str}")
      end

    end

    # file - path of the audio file to inspect.
    #
    # Raises RuntimeError when no duration can be found in ffmpeg's output.
    def initialize(file)
      # The path is shell-escaped so names with spaces or quotes still work.
      output = `ffmpeg -i #{Shellwords.escape(file.to_s)} 2>&1`
      # Capture only the timestamp: the Duration line may carry several
      # comma-separated fields (", start: ..., bitrate: ...").
      stamp = output.to_s[/Duration:\s*(\d+:\d+:\d+(?:\.\d+)?)/, 1]
      raise "Unable to read the duration of #{file} from ffmpeg output" unless stamp
      self.duration = Duration.new(stamp)
    end

  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
require 'shellwords'

module Speech

  # Cuts an audio file into chunks of a fixed number of seconds using ffmpeg.
  class AudioSplitter
    attr_accessor :original_file, :size, :duration, :chunks

    # One slice of the original file: knows where its files live on disk and
    # how to produce them (ffmpeg cut, then flac encode, then resample).
    class AudioChunk
      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate

      # splitter - the owning AudioSplitter (provides original_file).
      # offset   - start of the chunk, in seconds.
      # duration - length of the chunk, in seconds.
      def initialize(splitter, offset, duration)
        self.splitter = splitter
        self.offset = offset
        self.duration = duration
        self.chunk = chunk_path_for(splitter.original_file, offset)
      end

      # given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
      #
      # Returns self so calls can be chained; raises RuntimeError when ffmpeg fails.
      def build
        # ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
        offset_ts = AudioInspector::Duration.from_seconds(self.offset)
        duration_ts = AudioInspector::Duration.from_seconds(self.duration)
        cmd = "ffmpeg -y -i #{Shellwords.escape(splitter.original_file)} -acodec copy -vcodec copy " +
              "-ss #{offset_ts} -t #{duration_ts} #{Shellwords.escape(self.chunk)} >/dev/null 2>&1"
        unless system(cmd)
          raise "Failed to generate chunk at offset: #{offset_ts}, duration: #{duration_ts}\n#{cmd}"
        end
        self
      end

      # convert the audio file to flac format, then resample it to 16K
      #
      # Sets flac_chunk and flac_rate. Raises RuntimeError when either the
      # flac encode or the resample step fails.
      def to_flac
        unless system("flac #{Shellwords.escape(chunk)} >/dev/null 2>&1")
          raise "failed to convert #{chunk} to flac"
        end

        # Swap only the trailing extension; a substring replace would also
        # hit an identical ".ext" occurring earlier in the path.
        ext = File.extname(chunk)
        self.flac_chunk = (ext.empty? ? chunk : chunk[0...-ext.length]) + ".flac"

        down_sampled = self.flac_chunk.sub(/\.flac\z/, '-sampled.flac')
        resample = "ffmpeg -i #{Shellwords.escape(self.flac_chunk)} -ar 16000 -y " +
                   "#{Shellwords.escape(down_sampled)} >/dev/null 2>&1"
        raise "failed to convert to lower audio rate" unless system(resample)

        File.rename(down_sampled, self.flac_chunk)
        self.flac_rate = 16000
      end

      # Returns the raw bytes of the flac file (read in binary mode).
      def to_flac_bytes
        File.open(self.flac_chunk, "rb") { |f| f.read }
      end

      # delete the chunk file and its flac counterpart, if they exist
      def clean
        File.unlink self.chunk if self.chunk && File.exist?(self.chunk)
        File.unlink self.flac_chunk if self.flac_chunk && File.exist?(self.flac_chunk)
      end

      private

      # Path of the chunk file: "chunk-<name>-<offset><ext>", placed next to
      # the original file. A bare file name yields a bare chunk name.
      def chunk_path_for(file, offset)
        ext = File.extname(file)
        name = "chunk-#{File.basename(file, ext)}-#{offset}#{ext}"
        dir = File.dirname(file)
        dir == "." ? name : File.join(dir, name)
      end

    end

    # file       - path of the audio file to split.
    # chunk_size - chunk length in seconds (must be positive).
    def initialize(file, chunk_size=30)
      raise ArgumentError, "chunk_size must be greater than zero" unless chunk_size.to_f > 0
      self.original_file = file
      self.duration = AudioInspector.new(file).duration
      self.size = chunk_size
      self.chunks = []
    end

    # Computes the chunk list and returns it (also stored in +chunks+).
    #
    # Every chunk is +size+ seconds long except the last, which also absorbs
    # the remainder so no chunk is shorter than +size+. A file with fewer than
    # two full chunks becomes a single chunk covering the whole duration.
    def split
      # Start from a clean list so calling split again does not duplicate chunks.
      self.chunks = []

      total = self.duration.to_f
      full_chunks = (total / size).to_i
      last_chunk = ((total % size) * 100).round / 100.0

      if full_chunks < 2
        chunks << AudioChunk.new(self, 0, total)
      else
        (full_chunks - 1).times do |chunkid|
          chunks << AudioChunk.new(self, chunkid * self.size, self.size)
        end
        # Offset is computed directly so fractional chunk sizes are not truncated.
        chunks << AudioChunk.new(self, (full_chunks - 1) * self.size, self.size + last_chunk)
      end

      chunks
    end

  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
module Speech

  # Converts an audio file to text by splitting it into chunks, encoding each
  # chunk as flac and posting it to Google's speech recognition endpoint.
  #
  # Results accumulate in +captured_json+ (an Array of [utterance, confidence]
  # pairs) and are also written as JSON to +captured_file+.
  class AudioToText
    attr_accessor :file, :rate, :captured_json, :confidence, :captured_file

    # Upper bound on consecutive HTTP 500 answers tolerated for one chunk.
    DEFAULT_MAX_RETRIES = 100

    # file - path of the audio file to transcribe.
    def initialize(file)
      self.file = file
      # Always derive a path that differs from the input: the JSON result is
      # written there, so reusing the audio path would overwrite the audio.
      ext = File.extname(file)
      self.captured_file = (ext.empty? ? file : file[0...-ext.length]) + '.json'
      self.captured_json = []
      self.confidence = 0.0
    end

    # Transcribes the whole file and returns the parsed contents of
    # +captured_file+: a Hash with "captured_json" and "confidence" keys.
    def to_text
      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US"
      # Reset so a second call does not pile onto the previous results.
      self.captured_json = []
      self.confidence = 0.0
      splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
      easy = Curl::Easy.new(url)
      splitter.split.each do |chunk|
        begin
          chunk.build.to_flac
          convert_chunk(easy, chunk)
        ensure
          # Also covers failures in build/to_flac, which happen before
          # convert_chunk gets a chance to clean up.
          chunk.clean
        end
      end
      JSON.parse(File.read(self.captured_file))
    end

    # Removes the JSON result file, if present.
    def clean
      File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
    end

    protected

    # Posts one chunk and records the hypotheses from the response.
    #
    # easy    - the HTTP handle used for the POST.
    # chunk   - the chunk to send; it is cleaned up when this method exits.
    # options - :max_retries => how many HTTP 500 answers to tolerate before
    #           giving up (default DEFAULT_MAX_RETRIES).
    #
    # Raises RuntimeError once the retry budget is exhausted.
    def convert_chunk(easy, chunk, options={})
      max_retries = options.fetch(:max_retries, DEFAULT_MAX_RETRIES)
      puts "sending chunk of size #{chunk.duration}..."

      # The request is identical on every attempt, so set it up once.
      easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
      easy.post_body = "Content=#{chunk.to_flac_bytes}"
      easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
      easy.on_complete {|_handle| puts }

      failures = 0
      retrying = true
      while retrying
        easy.http_post
        if easy.response_code == 500
          failures += 1
          if failures > max_retries
            raise "giving up on chunk at offset #{chunk.respond_to?(:offset) ? chunk.offset : '?'}: " +
                  "received HTTP 500 #{failures} times"
          end
          puts "500 from google retry after 0.5 seconds"
          sleep 0.5 # wait longer on error?
        else
          # {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
          record_hypotheses(JSON.parse(easy.body_str))
          write_captured_file
          retrying = false
        end
        sleep 0.1 # not too fast there tiger
      end
    ensure
      chunk.clean
    end

    # Appends every hypothesis in +data+ to captured_json and adds its
    # confidence to the running total. A response without a "hypotheses"
    # list, or a hypothesis without a confidence, contributes nothing.
    def record_hypotheses(data)
      hypotheses = data.is_a?(Hash) ? data['hypotheses'] : nil
      (hypotheses || []).each do |hypothesis|
        self.captured_json << [hypothesis['utterance'], hypothesis['confidence']]
        self.confidence += hypothesis['confidence'].to_f
      end
    end

    # Writes the results gathered so far, with the average confidence over
    # all captured hypotheses (0 when there are none), to captured_file.
    def write_captured_file
      count = self.captured_json.size
      confidence_calc = count > 0 ? self.confidence / count : 0
      File.open("#{self.captured_file}", "wb") {|f|
        f << {:captured_json => captured_json, :confidence => confidence_calc}.to_json
      }
    end

  end

end
|
data/speech2text.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Gem specification for speech2text.
# The gem's lib directory is put on the load path first so the version
# constant can be read from speech/version.
$:.unshift File.expand_path(File.dirname(__FILE__) + "/lib")
require "speech/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "speech2text"
  spec.version     = Speech::Info::VERSION
  spec.summary     = "Speech to Text Library"
  spec.description = "Super powers of Google wrapped in a nice Ruby interface"
  spec.homepage    = "https://github.com/taf2/speech2text"

  # Author
  spec.authors = ["Todd A. Fisher"]
  spec.email   = "todd.fisher@gmail.com"

  # Packaged files and the command-line entry point
  spec.files       = Dir["{lib,bin,test}/**/*", "Rakefile", "README.rdoc", "*.gemspec"]
  spec.executables = %w(speech2text)

  # Runtime dependencies
  %w(curb json).each do |dependency|
    spec.add_dependency dependency
  end
end
|
Binary file
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
require 'test/unit'
|
3
|
+
$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
require 'speech'
|
5
|
+
|
6
|
+
# Exercises Speech::AudioSplitter against the i-like-pickles.wav sample,
# which is looked up relative to the current working directory.
class SpeechAudioSplitterTest < Test::Unit::TestCase

  # Splits the sample into one-second chunks, builds each chunk and its flac
  # counterpart, and checks that clean removes both files again.
  def test_audio_splitter
    splitter = Speech::AudioSplitter.new("i-like-pickles.wav", 1)

    assert_equal '00:00:03:52', splitter.duration.to_s
    # Compare with a tolerance rather than exact float equality.
    assert_in_delta 3.52, splitter.duration.to_f, 0.001

    chunks = splitter.split
    assert_equal 3, chunks.size
    chunks.each do |chunk|
      begin
        chunk.build.to_flac
        assert File.exist?(chunk.chunk)
        assert File.exist?(chunk.flac_chunk)
      ensure
        # Remove the generated files even when an assertion above fails, so
        # a failing run does not leave chunk files behind.
        chunk.clean
      end
      assert !File.exist?(chunk.chunk)
      assert !File.exist?(chunk.flac_chunk)
    end
  end

end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
require 'test/unit'
|
3
|
+
$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
require 'speech'
|
5
|
+
|
6
|
+
# End-to-end check of Speech::AudioToText against the i-like-pickles.wav
# sample, which is looked up relative to the current working directory.
class SpeechAudioToTextTest < Test::Unit::TestCase
  # Transcribes the sample and checks the shape and content of the result.
  def test_audio_to_text
    audio = Speech::AudioToText.new("i-like-pickles.wav")
    captured_json = audio.to_text
    assert captured_json
    assert captured_json.key?("captured_json")
    assert !captured_json['captured_json'].empty?
    assert_equal ['captured_json', 'confidence'], captured_json.keys.sort
    assert_equal "I like pickles", captured_json['captured_json'].flatten.first
    assert captured_json['confidence'] > 0.9
    # {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
    # puts captured_json.inspect
  ensure
    # audio is nil when the constructor itself raised; guarding here keeps
    # the original error visible instead of a NoMethodError from cleanup.
    audio.clean if audio
  end
end
|
Binary file
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: speech2text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Todd A. Fisher
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-03-24 00:00:00.000000000 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: curb
|
17
|
+
requirement: &2157003720 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *2157003720
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: json
|
28
|
+
requirement: &2157003280 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2157003280
|
37
|
+
description: Super powers of Google wrapped in a nice Ruby interface
|
38
|
+
email: todd.fisher@gmail.com
|
39
|
+
executables:
|
40
|
+
- speech2text
|
41
|
+
extensions: []
|
42
|
+
extra_rdoc_files: []
|
43
|
+
files:
|
44
|
+
- lib/speech/audio_inspector.rb
|
45
|
+
- lib/speech/audio_splitter.rb
|
46
|
+
- lib/speech/audio_to_text.rb
|
47
|
+
- lib/speech/version.rb
|
48
|
+
- lib/speech.rb
|
49
|
+
- bin/speech2text
|
50
|
+
- test/audio_inspector_test.rb
|
51
|
+
- test/audio_splitter_test.rb
|
52
|
+
- test/audio_to_text_test.rb
|
53
|
+
- test/i-like-pickles.wav
|
54
|
+
- test/SampleAudio.wav
|
55
|
+
- test/samples/i-like-pickles.wav
|
56
|
+
- Rakefile
|
57
|
+
- README.rdoc
|
58
|
+
- speech2text.gemspec
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: https://github.com/taf2/speech2text
|
61
|
+
licenses: []
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options: []
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
requirements: []
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.6.2
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: Speech to Text Library
|
84
|
+
test_files: []
|