speech2text 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,17 @@
1
+ == Speech2Text
2
+
3
+ Using the power of ffmpeg, flac, Google and Ruby, here is a simple interface for converting speech to text.
4
+
5
+ It uses a new, undocumented speech API from Google, with the help of this article: http://mikepultz.com/2011/03/accessing-google-speech-api-chrome-11/
6
+
7
+ We're able to provide a very simple API in Ruby to decode simple audio to text.
8
+
9
+ The API from Google is not yet public and so may change. It also seems to be very fragile: more often than not it returns a 500, so the library has retry code built in. For larger audio files, 10 or more failures may occur before a successful result is retrieved.
10
+
11
+ It also appears that the API only accepts smaller audio files, so there is a built-in chunker that splits the audio into smaller chunks.
12
+
13
+ == Example
14
+
15
+ audio = Speech::AudioToText.new("i-like-pickles.wav")
16
+ puts audio.to_text.inspect
17
+ => {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
require 'rake/testtask'

# A bare `rake` invocation runs the test suite.
desc "Default Task (Test project)"
task :default => [:test]

# The :test task picks up every file matching test/*_test.rb and runs quietly.
Rake::TestTask.new(:test) { |test_task|
  test_task.test_files = FileList['test/*_test.rb']
  test_task.verbose = false
}
data/bin/speech2text ADDED
@@ -0,0 +1,7 @@
1
+ #!/this/will/be/replaced/by/rubygems
2
+ # -*- encoding: binary -*-
3
+
4
require 'speech'

# Command-line entry point: transcribe the audio file named by the first
# argument and print the resulting hash (captured utterances + confidence).
audio_path = ARGV[0]

# Without a path the library would fail deep inside with an unhelpful
# error, so stop early with a usage line and a non-zero exit status.
if audio_path.nil? || audio_path.empty?
  warn "usage: speech2text <audio-file.wav>"
  exit 1
end

captured_json = Speech::AudioToText.new(audio_path).to_text
puts captured_json.inspect
data/lib/speech.rb ADDED
@@ -0,0 +1,9 @@
1
+ # -*- encoding: binary -*-
2
+ require 'curb'
3
+ require 'json'
4
+
5
+ module Speech; end
6
+
7
+ require 'speech/audio_inspector'
8
+ require 'speech/audio_splitter'
9
+ require 'speech/audio_to_text'
@@ -0,0 +1,43 @@
1
+ # -*- encoding: binary -*-
2
require 'shellwords'

module Speech

  # Determines the duration of an audio file by parsing the banner printed
  # by `ffmpeg -i <file>`.
  class AudioInspector
    attr_accessor :duration

    # A duration broken into clock fields. +hours+, +minutes+ and +seconds+
    # are kept as strings (seconds may carry a fractional part such as
    # "03.52"); +total_seconds+ is the same span as a Float.
    class Duration
      attr_accessor :hours, :minutes, :seconds, :total_seconds

      # duration_str - text of the form "HH:MM:SS.ff", as printed by ffmpeg.
      def initialize(duration_str)
        self.hours, self.minutes, self.seconds = duration_str.split(':')
        self.total_seconds = (hours.to_i * 3600) + (minutes.to_i * 60) + seconds.to_f
      end

      # Formats the duration as "HH:MM:SS:hh", where the last field is
      # hundredths of a second.
      def to_s
        whole, fraction = seconds.to_s.split('.')
        # Normalise the fraction to exactly two digits so that ".5" is
        # rendered as 50 hundredths, not 5.
        hundredths = fraction.to_s.ljust(2, '0')[0, 2]
        # Convert with to_i: handing zero-padded strings such as "08" to %d
        # makes sprintf parse them as (invalid) octal and raise.
        sprintf "%.2d:%.2d:%.2d:%.2d", hours.to_i, minutes.to_i, whole.to_i, hundredths.to_i
      end

      # Returns the duration as a Float number of seconds.
      def to_f
        self.total_seconds
      end

      # Builds a Duration from a number of seconds (Integer or Float).
      def self.from_seconds(seconds)
        whole_hours = seconds.to_i / 3600
        # Minutes are what is left after removing whole hours.
        whole_minutes = ((seconds - (whole_hours * 3600)) / 60).to_i
        remainder = seconds - (whole_hours * 3600) - (whole_minutes * 60)
        # Float subtraction leaves artefacts such as 31.519999999999996;
        # hundredths are the finest resolution to_s prints.
        remainder = remainder.round(2) if remainder.is_a?(Float)

        duration = new("00:00:00.00")
        duration.hours = whole_hours.to_s
        duration.minutes = whole_minutes.to_s
        duration.seconds = remainder.to_s
        duration.total_seconds = seconds.to_f
        duration
      end

    end

    # Extracts the text following "Duration:" (up to the next comma or
    # whitespace) from ffmpeg's output. Returns nil when no such field exists.
    def self.parse_duration(ffmpeg_output)
      ffmpeg_output.to_s[/Duration:\s*([^,\s]+)/, 1]
    end

    # file - path of the audio file to inspect.
    #
    # Raises RuntimeError when ffmpeg's output contains no Duration field
    # (for example when the file is missing or ffmpeg is not installed).
    def initialize(file)
      # Escape the path so spaces or shell metacharacters cannot break (or
      # hijack) the command line.
      output = `ffmpeg -i #{Shellwords.escape(file.to_s)} 2>&1`
      stamp = self.class.parse_duration(output)
      raise "Unable to determine the duration of #{file.inspect} using ffmpeg" unless stamp
      self.duration = Duration.new(stamp)
    end

  end
end
@@ -0,0 +1,87 @@
1
+ # -*- encoding: binary -*-
2
require 'shellwords'

module Speech

  # Splits an audio file into consecutive chunks of roughly +size+ seconds
  # so that each piece can be converted and uploaded separately.
  class AudioSplitter
    attr_accessor :original_file, :size, :duration, :chunks

    # One slice of the original file: +offset+ and +duration+ are in
    # seconds, +chunk+ is the path of the extracted slice and +flac_chunk+
    # the path of its FLAC conversion (nil until #to_flac has run).
    class AudioChunk
      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate

      # splitter - object responding to #original_file.
      # offset   - start of the slice, in seconds.
      # duration - length of the slice, in seconds.
      def initialize(splitter, offset, duration)
        self.offset = offset
        self.chunk = chunk_path_for(splitter.original_file, offset)
        self.duration = duration
        self.splitter = splitter
      end

      # Extracts this slice from the original file with ffmpeg.
      # Returns self; raises RuntimeError when ffmpeg fails.
      def build
        # ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
        offset_ts = AudioInspector::Duration.from_seconds(self.offset)
        duration_ts = AudioInspector::Duration.from_seconds(self.duration)
        # Paths are shell-escaped so names containing spaces or shell
        # metacharacters are passed through intact.
        cmd = "ffmpeg -y -i #{Shellwords.escape(splitter.original_file)} -acodec copy -vcodec copy " \
              "-ss #{offset_ts} -t #{duration_ts} #{Shellwords.escape(self.chunk)} >/dev/null 2>&1"
        unless system(cmd)
          raise "Failed to generate chunk at offset: #{offset_ts}, duration: #{duration_ts}\n#{cmd}"
        end
        self
      end

      # Converts the slice to FLAC and resamples it to 16 kHz in place.
      # Raises RuntimeError when either conversion step fails.
      def to_flac
        unless system("flac #{Shellwords.escape(chunk)} >/dev/null 2>&1")
          # Previously this failure was silent and only surfaced later as a
          # nil flac_chunk.
          raise "failed to convert #{chunk} to flac"
        end

        # Swap only the trailing extension for ".flac".
        extension = File.extname(chunk)
        self.flac_chunk = chunk[0, chunk.length - extension.length] + ".flac"

        # Record the rate ffmpeg reports before resampling.
        reported = `ffmpeg -i #{Shellwords.escape(flac_chunk)} 2>&1`[/Audio: flac, (.*) Hz/, 1]
        self.flac_rate = reported.strip if reported

        # convert the audio file to 16K
        down_sampled = flac_chunk.sub(/\.flac\z/, '-sampled.flac')
        unless system("ffmpeg -i #{Shellwords.escape(flac_chunk)} -ar 16000 -y #{Shellwords.escape(down_sampled)} >/dev/null 2>&1")
          raise "failed to convert to lower audio rate"
        end
        File.rename(down_sampled, flac_chunk)
        self.flac_rate = 16000
      end

      # Returns the raw bytes of the FLAC file (read in binary mode).
      def to_flac_bytes
        File.binread(self.flac_chunk)
      end

      # delete the chunk file (and its FLAC conversion, when present)
      def clean
        File.unlink self.chunk if File.exist?(self.chunk)
        File.unlink self.flac_chunk if self.flac_chunk && File.exist?(self.flac_chunk)
      end

      private

      # Builds "chunk-<name>-<offset><ext>". A bare file name yields a bare
      # chunk name; a path with a directory keeps the chunk in that directory.
      def chunk_path_for(original_file, offset)
        directory, file_name = File.split(original_file)
        extension = File.extname(file_name)
        chunk_name = "chunk-#{File.basename(file_name, extension)}-#{offset}#{extension}"
        file_name == original_file ? chunk_name : File.join(directory, chunk_name)
      end

    end

    # file       - path of the audio file to split.
    # chunk_size - target chunk length in seconds (must be positive).
    def initialize(file, chunk_size=30)
      raise ArgumentError, "chunk_size must be a positive number of seconds" unless chunk_size.to_f > 0
      self.original_file = file
      self.duration = AudioInspector.new(file).duration
      self.size = chunk_size
      self.chunks = []
    end

    # Computes the chunk list and returns it. All chunks but the last are
    # exactly +size+ seconds; the last one also absorbs the remainder, so no
    # chunk is ever shorter than +size+ unless the whole file is.
    def split
      total = self.duration.to_f
      # compute the total number of chunks
      full_chunks = (total / size).to_i
      last_chunk = ((total % size) * 100).round / 100.0

      # Start from a clean list so calling split twice does not duplicate.
      self.chunks = []

      (full_chunks - 1).times do |chunkid|
        chunks << AudioChunk.new(self, chunkid * self.size, self.size)
      end

      if chunks.empty?
        chunks << AudioChunk.new(self, 0, total)
      else
        # Derive the offset from the chunk count rather than truncating the
        # previous chunk's values, so fractional chunk sizes stay aligned.
        chunks << AudioChunk.new(self, (full_chunks - 1) * self.size, self.size + last_chunk)
      end

      chunks
    end

  end
end
@@ -0,0 +1,73 @@
1
+ # -*- encoding: binary -*-
2
module Speech

  # Converts an audio file to text by splitting it into chunks, posting each
  # chunk as FLAC to Google's speech API and collecting the hypotheses.
  # Results are accumulated in +captured_json+ and also written as JSON to
  # +captured_file+.
  class AudioToText
    attr_accessor :file, :rate, :captured_json, :confidence, :captured_file

    # file - path of the audio file to transcribe.
    def initialize(file)
      raise ArgumentError, "an audio file path is required" if file.nil?
      self.file = file
      # Replace whatever extension the file has with ".json". Only handling
      # ".wav" left captured_file equal to the input path for other names,
      # so writing the results would have overwritten the audio itself.
      extension = File.extname(file)
      self.captured_file = file[0, file.length - extension.length] + '.json'
      self.captured_json = []
      self.confidence = 0.0
    end

    # Transcribes the whole file and returns the parsed contents of
    # +captured_file+: {"captured_json" => [[utterance, confidence], ...],
    # "confidence" => average confidence}.
    def to_text
      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US"
      # Start fresh so a second call does not mix in earlier results.
      self.captured_json = []
      self.confidence = 0.0
      splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
      easy = Curl::Easy.new(url)
      splitter.split.each do |chunk|
        begin
          chunk.build.to_flac
          convert_chunk(easy, chunk)
        ensure
          # Also covers failures in build/to_flac, which happen before
          # convert_chunk's own cleanup would run.
          chunk.clean
        end
      end
      JSON.parse(File.read(self.captured_file))
    end

    # Deletes the JSON results file, if it exists.
    def clean
      File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
    end

    protected

    # Posts one chunk, retrying while the service answers 500, then records
    # the hypotheses and rewrites +captured_file+. The chunk's files are
    # always removed afterwards.
    #
    # easy    - HTTP handle used for the POST.
    # chunk   - chunk responding to duration, flac_rate, to_flac_bytes, clean.
    # options - :max_retries caps the number of retries after a 500; when it
    #           is absent the call retries indefinitely (the historical
    #           behaviour). Exceeding the cap raises RuntimeError.
    def convert_chunk(easy, chunk, options={})
      max_retries = options[:max_retries]
      failures = 0
      puts "sending chunk of size #{chunk.duration}..."
      loop do
        easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
        easy.post_body = "Content=#{chunk.to_flac_bytes}"
        easy.on_progress {|_dl_total, _dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
        easy.on_complete {|_handle| puts }
        easy.http_post

        succeeded = easy.response_code != 500
        if succeeded
          # {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
          record_hypotheses(JSON.parse(easy.body_str))
          write_capture
        else
          failures += 1
          if max_retries && failures > max_retries
            raise "giving up on chunk after #{failures} attempts that returned 500"
          end
          puts "500 from google retry after 0.5 seconds"
          sleep 0.5 # wait longer on error?
        end
        sleep 0.1 # not too fast there tiger
        break if succeeded
      end
    ensure
      chunk.clean
    end

    # Appends every hypothesis in +data+ to captured_json and adds its
    # confidence to the running total. A response without hypotheses, or a
    # hypothesis without a confidence, contributes nothing instead of raising.
    def record_hypotheses(data)
      (data['hypotheses'] || []).each do |hypothesis|
        self.captured_json << [hypothesis['utterance'], hypothesis['confidence']]
        self.confidence += hypothesis['confidence'].to_f
      end
    end

    # Writes the utterances captured so far, plus their average confidence,
    # to captured_file as JSON.
    def write_capture
      count = self.captured_json.size
      average = count > 0 ? self.confidence / count : 0
      File.open(self.captured_file, "wb") do |f|
        f << {:captured_json => captured_json, :confidence => average}.to_json
      end
    end

  end

end
@@ -0,0 +1,6 @@
1
+ # -*- encoding: binary -*-
2
module Speech
  # Holds metadata about the gem itself.
  class Info
    # Current gem version; frozen so the constant cannot be mutated in place.
    VERSION = '0.0.2'.freeze
  end
end
@@ -0,0 +1,17 @@
1
# Make lib/ loadable so the version constant can be read without the gem
# being installed.
$LOAD_PATH.unshift File.expand_path("lib", File.dirname(__FILE__))
require "speech/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "speech2text"
  spec.version     = Speech::Info::VERSION
  spec.authors     = ["Todd A. Fisher"]
  spec.email       = "todd.fisher@gmail.com"
  spec.homepage    = "https://github.com/taf2/speech2text"
  spec.summary     = "Speech to Text Library"
  spec.description = "Super powers of Google wrapped in a nice Ruby interface"

  # Packaged files and the command-line executable
  spec.files       = Dir["{lib,bin,test}/**/*", "Rakefile", "README.rdoc", "*.gemspec"]
  spec.executables = ["speech2text"]

  # Runtime dependencies
  spec.add_dependency "curb"
  spec.add_dependency "json"
end
Binary file
@@ -0,0 +1,9 @@
1
+ # -*- encoding: binary -*-
2
+ require 'test/unit'
3
+ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ require 'speech'
5
+
6
# Covers the parts of Speech::AudioInspector that can be exercised without
# ffmpeg or an audio fixture, i.e. the Duration value object.
class SpeechAudioInspectorTest < Test::Unit::TestCase
  # Parsing an ffmpeg-style "HH:MM:SS.ff" stamp.
  def test_audio_inspector
    duration = Speech::AudioInspector::Duration.new("00:00:03.52")
    assert_equal '00:00:03:52', duration.to_s
    assert_in_delta 3.52, duration.to_f, 0.001
  end

  # Building a Duration from a plain number of seconds.
  def test_duration_from_seconds
    assert_equal '00:00:30:00', Speech::AudioInspector::Duration.from_seconds(30).to_s
    assert_equal '00:00:01:52', Speech::AudioInspector::Duration.from_seconds(1.52).to_s
  end
end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: binary -*-
2
+ require 'test/unit'
3
+ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ require 'speech'
5
+
6
# Exercises Speech::AudioSplitter against the bundled sample recording.
# Requires the ffmpeg and flac executables.
class SpeechAudioSplitterTest < Test::Unit::TestCase

  def test_audio_splitter
    # The fixture is referenced by bare name, so run from this file's
    # directory; otherwise the test only passes when started from there.
    Dir.chdir(File.dirname(__FILE__)) do
      splitter = Speech::AudioSplitter.new("i-like-pickles.wav", 1)

      assert_equal '00:00:03:52', splitter.duration.to_s
      # Compare floats with a tolerance rather than exact equality.
      assert_in_delta 3.52, splitter.duration.to_f, 0.001

      chunks = splitter.split
      assert_equal 3, chunks.size
      begin
        chunks.each do |chunk|
          chunk.build.to_flac
          assert File.exist?(chunk.chunk)
          assert File.exist?(chunk.flac_chunk)
          chunk.clean
          assert !File.exist?(chunk.chunk)
          assert !File.exist?(chunk.flac_chunk)
        end
      ensure
        # Leave no chunk files behind when an assertion above fails.
        chunks.each { |chunk| chunk.clean }
      end
    end
  end

end
@@ -0,0 +1,21 @@
1
+ # -*- encoding: binary -*-
2
+ require 'test/unit'
3
+ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ require 'speech'
5
+
6
# End-to-end check of Speech::AudioToText against the bundled sample.
# Requires ffmpeg, flac and network access to the speech service.
class SpeechAudioToTextTest < Test::Unit::TestCase
  def test_audio_to_text
    # The fixture is referenced by bare name, so run from this file's
    # directory; otherwise the test only passes when started from there.
    Dir.chdir(File.dirname(__FILE__)) do
      audio = Speech::AudioToText.new("i-like-pickles.wav")
      begin
        captured_json = audio.to_text
        assert captured_json
        assert captured_json.key?("captured_json")
        assert !captured_json['captured_json'].empty?
        assert_equal ['captured_json', 'confidence'], captured_json.keys.sort
        assert_equal "I like pickles", captured_json['captured_json'].flatten.first
        assert captured_json['confidence'] > 0.9
        # {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
        # puts captured_json.inspect
      ensure
        # Scoped to after construction, so a failure in AudioToText.new is
        # reported as itself rather than masked by a call on nil.
        audio.clean
      end
    end
  end
end
Binary file
Binary file
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: speech2text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Todd A. Fisher
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-03-24 00:00:00.000000000 -04:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: curb
17
+ requirement: &2157003720 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *2157003720
26
+ - !ruby/object:Gem::Dependency
27
+ name: json
28
+ requirement: &2157003280 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: *2157003280
37
+ description: Super powers of Google wrapped in a nice Ruby interface
38
+ email: todd.fisher@gmail.com
39
+ executables:
40
+ - speech2text
41
+ extensions: []
42
+ extra_rdoc_files: []
43
+ files:
44
+ - lib/speech/audio_inspector.rb
45
+ - lib/speech/audio_splitter.rb
46
+ - lib/speech/audio_to_text.rb
47
+ - lib/speech/version.rb
48
+ - lib/speech.rb
49
+ - bin/speech2text
50
+ - test/audio_inspector_test.rb
51
+ - test/audio_splitter_test.rb
52
+ - test/audio_to_text_test.rb
53
+ - test/i-like-pickles.wav
54
+ - test/SampleAudio.wav
55
+ - test/samples/i-like-pickles.wav
56
+ - Rakefile
57
+ - README.rdoc
58
+ - speech2text.gemspec
59
+ has_rdoc: true
60
+ homepage: https://github.com/taf2/speech2text
61
+ licenses: []
62
+ post_install_message:
63
+ rdoc_options: []
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 1.6.2
81
+ signing_key:
82
+ specification_version: 3
83
+ summary: Speech to Text Library
84
+ test_files: []