speech2text 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,17 @@
1
+ == Speech2Text
2
+
3
+ Using the power of ffmpeg, flac, Google and Ruby, here is a simple interface for converting speech to text.
4
+
5
+ Using a new undocumented speech API from Google with the help of this article: http://mikepultz.com/2011/03/accessing-google-speech-api-chrome-11/
6
+
7
+ We're able to provide a very simple API in Ruby to decode simple audio to text.
8
+
9
+ The API from Google is not yet public, so it may change. It also seems to be very fragile: more often than not it returns a 500, so the library has retry code built in. For larger audio files, 10 or more failures may occur before a successful result is retrieved.
10
+
11
+ It also appears that the API only likes smaller audio files, so there is a built-in chunker that splits the audio into smaller chunks.
12
+
13
+ == Example
14
+
15
+ audio = Speech::AudioToText.new("i-like-pickles.wav")
16
+ puts audio.to_text.inspect
17
+ => {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
require 'rake/testtask'

# Running `rake` with no task name runs the test task.
desc "Default Task (Test project)"
task(:default => :test)

# Defines the :test task: loads every test/*_test.rb file, without verbose output.
Rake::TestTask.new(:test) { |test_task|
  test_task.test_files = FileList['test/*_test.rb']
  test_task.verbose = false
}
data/bin/speech2text ADDED
@@ -0,0 +1,7 @@
1
#!/this/will/be/replaced/by/rubygems
# -*- encoding: binary -*-
#
# Command-line entry point: transcribes the audio file named by the first
# argument and prints the resulting hash (as returned by
# Speech::AudioToText#to_text) using #inspect.

require 'speech'

audio_path = ARGV[0]

# Fail with a readable message instead of a NoMethodError from deep inside the
# library when the argument is missing or does not name a file.
abort "usage: speech2text <audio-file>" if audio_path.nil? || audio_path.empty?
abort "speech2text: no such file: #{audio_path}" unless File.file?(audio_path)

captured_json = Speech::AudioToText.new(audio_path).to_text
puts captured_json.inspect
data/lib/speech.rb ADDED
@@ -0,0 +1,9 @@
1
+ # -*- encoding: binary -*-
2
+ require 'curb'
3
+ require 'json'
4
+
5
+ module Speech; end
6
+
7
+ require 'speech/audio_inspector'
8
+ require 'speech/audio_splitter'
9
+ require 'speech/audio_to_text'
@@ -0,0 +1,43 @@
1
+ # -*- encoding: binary -*-
2
require 'shellwords'

module Speech

  # Determines the playing time of an audio file by running `ffmpeg -i` on it
  # and parsing the "Duration:" line of the output.
  class AudioInspector
    attr_accessor :duration

    # A span of time split into hours / minutes / seconds fields.
    #
    # The fields are kept as strings when parsed from ffmpeg output
    # ("00", "00", "03.52"); +total_seconds+ is the same span as a Float.
    class Duration
      attr_accessor :hours, :minutes, :seconds, :total_seconds

      # duration_str - a "HH:MM:SS.ff" string such as "00:00:03.52".
      def initialize(duration_str)
        self.hours, self.minutes, self.seconds = duration_str.to_s.strip.split(':')
        self.total_seconds = (hours.to_i * 3600) + (minutes.to_i * 60) + seconds.to_f
      end

      # Formats the duration as "HH:MM:SS:FF" where FF is hundredths of a
      # second (e.g. "00:00:03:52" for 3.52 seconds).
      #
      # Fields are converted with to_i / to_f rather than being handed to
      # sprintf as strings: sprintf's %d interprets a string such as "08" as
      # (invalid) octal and raises ArgumentError. Working in hundredths also
      # means "3.5" renders as ":50" rather than ":05".
      def to_s
        whole_part = ((hours.to_i * 3600) + (minutes.to_i * 60)) * 100
        hundredths = whole_part + (seconds.to_f * 100).round

        rest, ff = hundredths.divmod(100)
        rest, ss = rest.divmod(60)
        hh, mm   = rest.divmod(60)

        format('%02d:%02d:%02d:%02d', hh, mm, ss, ff)
      end

      # Returns the duration in seconds as a Float.
      def to_f
        self.total_seconds
      end

      # Builds a Duration from a numeric count of seconds.
      #
      # Minutes are computed from what is left after removing whole hours, so
      # values of one hour or more are split correctly, and total_seconds is
      # populated so that #to_f round-trips.
      def self.from_seconds(seconds)
        whole_hours   = (seconds / 3600).to_i
        whole_minutes = ((seconds - (whole_hours * 3600)) / 60).to_i
        remainder     = seconds - (whole_hours * 3600) - (whole_minutes * 60)

        duration = new("00:00:00.00")
        duration.hours         = whole_hours.to_s
        duration.minutes       = whole_minutes.to_s
        duration.seconds       = remainder.to_s
        duration.total_seconds = seconds.to_f
        duration
      end

    end

    # Extracts the "HH:MM:SS.ff" value from ffmpeg's textual output.
    #
    # Only the timestamp itself is captured, so extra fields on the same line
    # (", start: 0.000000, bitrate: ...") are not swallowed.
    #
    # Raises RuntimeError when no duration is present (ffmpeg failed, the file
    # is missing, or the duration is reported as N/A).
    def self.parse_duration(output)
      stamp = output.to_s[/Duration:\s*(\d+:\d+:\d+(?:\.\d+)?)/, 1]
      raise "could not determine audio duration from ffmpeg output:\n#{output}" if stamp.nil?
      stamp
    end

    # file - path of the audio file to inspect. Requires ffmpeg on the PATH.
    def initialize(file)
      # The path is shell-escaped so names with spaces or metacharacters work.
      output = `ffmpeg -i #{Shellwords.escape(file.to_s)} 2>&1`
      self.duration = Duration.new(self.class.parse_duration(output))
    end

  end
end
@@ -0,0 +1,87 @@
1
+ # -*- encoding: binary -*-
2
require 'shellwords'

module Speech

  # Cuts an audio file into consecutive chunks of roughly +size+ seconds so
  # that each piece is small enough to submit for recognition.
  class AudioSplitter
    attr_accessor :original_file, :size, :duration, :chunks

    # One slice of the original audio: knows how to cut itself out with
    # ffmpeg, encode itself as 16 kHz FLAC, and delete its temporary files.
    class AudioChunk
      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate

      # splitter - the owning AudioSplitter (provides original_file).
      # offset   - start of the slice, in seconds.
      # duration - length of the slice, in seconds.
      def initialize(splitter, offset, duration)
        self.offset = offset
        self.chunk = chunk_path_for(splitter.original_file, offset)
        self.duration = duration
        self.splitter = splitter
      end

      # Given the original file from the splitter and the chunk file name with
      # duration and offset, run the ffmpeg command that writes the chunk.
      #
      # Returns self so calls can be chained; raises RuntimeError when ffmpeg
      # exits unsuccessfully.
      def build
        # ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
        offset_ts = AudioInspector::Duration.from_seconds(self.offset)
        duration_ts = AudioInspector::Duration.from_seconds(self.duration)
        source = Shellwords.escape(splitter.original_file)
        target = Shellwords.escape(self.chunk)
        cmd = "ffmpeg -y -i #{source} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{target} >/dev/null 2>&1"
        unless system(cmd)
          raise "Failed to generate chunk at offset: #{offset_ts}, duration: #{duration_ts}\n#{cmd}"
        end
        self
      end

      # Convert the chunk to FLAC and resample it to 16 kHz.
      #
      # Returns the resulting sample rate (16000). Raises RuntimeError when
      # either the flac encoder or the ffmpeg resampling step fails, instead
      # of silently leaving flac_chunk unset.
      def to_flac
        # -f lets a stale .flac left behind by an interrupted run be replaced.
        unless system("flac -f #{Shellwords.escape(chunk)} >/dev/null 2>&1")
          raise "failed to encode #{chunk} as flac"
        end
        self.flac_chunk = flac_path_for(chunk)

        # convert the audio file to 16K
        down_sampled = self.flac_chunk.sub(/\.flac\z/, '-sampled.flac')
        source = Shellwords.escape(self.flac_chunk)
        target = Shellwords.escape(down_sampled)
        unless system("ffmpeg -i #{source} -ar 16000 -y #{target} >/dev/null 2>&1")
          raise "failed to convert to lower audio rate"
        end
        File.rename(down_sampled, self.flac_chunk)
        self.flac_rate = 16000
      end

      # Returns the raw bytes of the FLAC file (read in binary mode).
      def to_flac_bytes
        File.open(self.flac_chunk, 'rb') { |io| io.read }
      end

      # delete the chunk file (and its FLAC counterpart, when present)
      def clean
        File.unlink self.chunk if File.exist?(self.chunk)
        File.unlink self.flac_chunk if self.flac_chunk && File.exist?(self.flac_chunk)
      end

      private

      # Name of the chunk file for +original+ at +offset+:
      # "talk.wav" at offset 30 becomes "chunk-talk-30.wav". The chunk is
      # placed next to the original, and only the final extension of the base
      # name is considered, so "./talk.wav", "dir.d/talk.wav" and
      # extension-less names all yield distinct, usable paths.
      def chunk_path_for(original, offset)
        dir, name = File.split(original)
        ext = File.extname(name)
        chunk_name = "chunk-#{File.basename(name, ext)}-#{offset}#{ext}"
        dir == '.' ? chunk_name : File.join(dir, chunk_name)
      end

      # Path expected for the FLAC encoding of +path+: the final extension is
      # swapped for ".flac" (or ".flac" is appended when there is none).
      def flac_path_for(path)
        ext = File.extname(path)
        stem = ext.empty? ? path : path[0...-ext.length]
        "#{stem}.flac"
      end

    end

    # file       - path of the audio file to split.
    # chunk_size - target chunk length in seconds (default 30).
    #
    # Raises ArgumentError for a non-positive chunk size.
    def initialize(file, chunk_size=30)
      self.original_file = file
      self.size = chunk_size
      ensure_valid_size!
      self.duration = AudioInspector.new(file).duration
      self.chunks = []
    end

    # Computes the chunk list and returns it (also stored in #chunks).
    #
    # Every chunk is +size+ seconds long except the last, which also absorbs
    # the remainder (so it is between size and 2*size seconds). Audio shorter
    # than two full chunks is returned as a single chunk. Calling split again
    # rebuilds the list rather than appending to it.
    def split
      ensure_valid_size!
      total = self.duration.to_f
      # compute the total number of chunks
      full_chunks = (total / size).to_i
      # leftover seconds after the last full chunk, rounded to hundredths
      last_chunk = ((total % size) * 100).round / 100.0

      self.chunks = []
      if full_chunks < 2
        chunks << AudioChunk.new(self, 0, total)
      else
        (full_chunks - 1).times do |chunkid|
          chunks << AudioChunk.new(self, chunkid * self.size, self.size)
        end
        # The final chunk starts where the previous one ended; computing the
        # offset by multiplication keeps fractional sizes exact.
        chunks << AudioChunk.new(self, (full_chunks - 1) * self.size, self.size + last_chunk)
      end

      chunks
    end

    private

    # A zero or negative size would make the chunk arithmetic divide by zero.
    def ensure_valid_size!
      return if size.respond_to?(:to_f) && size.to_f > 0
      raise ArgumentError, "chunk size must be a positive number of seconds (got #{size.inspect})"
    end

  end
end
@@ -0,0 +1,73 @@
1
+ # -*- encoding: binary -*-
2
module Speech

  # Transcribes an audio file: splits it into chunks, posts each chunk as
  # FLAC to the speech recognition endpoint, and accumulates the hypotheses
  # into a JSON file next to the audio.
  class AudioToText
    # How many consecutive HTTP 500 responses are tolerated per chunk before
    # giving up (override with the :max_retries option; nil means unlimited).
    DEFAULT_MAX_RETRIES = 50

    attr_accessor :file, :rate, :captured_json, :confidence, :captured_file

    # file - path of the audio file to transcribe.
    #
    # captured_file is the same path with its extension replaced by ".json".
    # It is always different from the input, so writing results (or calling
    # #clean) can never clobber the audio itself, whatever its extension.
    def initialize(file)
      self.file = file
      self.captured_file = json_path_for(file)
      self.captured_json = []
      self.confidence = 0.0
    end

    # Runs the whole pipeline and returns the parsed contents of
    # captured_file: {"captured_json" => [[utterance, confidence], ...],
    # "confidence" => average}.
    #
    # options - :max_retries, see DEFAULT_MAX_RETRIES.
    def to_text(options = {})
      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US"
      # start from a clean slate so a second call does not double-count
      self.captured_json = []
      self.confidence = 0.0
      splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
      easy = Curl::Easy.new(url)
      splitter.split.each do |chunk|
        begin
          chunk.build.to_flac
          convert_chunk(easy, chunk, options)
        ensure
          # also covers failures in build/to_flac, which happen before
          # convert_chunk gets a chance to clean up
          chunk.clean
        end
      end
      JSON.parse(File.read(self.captured_file))
    end

    # Deletes the JSON results file, if present.
    def clean
      File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
    end

    protected

    # Posts one chunk and records the hypotheses from the response, retrying
    # while the server answers 500. The chunk's files are always removed.
    #
    # easy    - a Curl::Easy-like object already pointed at the endpoint.
    # chunk   - responds to duration, flac_rate, to_flac_bytes and clean.
    # options - :max_retries (Integer or nil for unlimited).
    #
    # Raises RuntimeError once more than :max_retries consecutive 500s are seen.
    def convert_chunk(easy, chunk, options={})
      max_retries = options.fetch(:max_retries, DEFAULT_MAX_RETRIES)
      puts "sending chunk of size #{chunk.duration}..."
      # read the audio once; it does not change between attempts
      payload = "Content=#{chunk.to_flac_bytes}"
      failures = 0
      loop do
        #easy.verbose = true
        easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
        easy.post_body = payload
        easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
        easy.on_complete {|_finished| puts }
        easy.http_post
        if easy.response_code == 500
          failures += 1
          if max_retries && failures > max_retries
            raise "giving up on chunk after #{failures} consecutive 500 responses from google"
          end
          puts "500 from google retry after 0.5 seconds"
          sleep 0.5 # wait longer on error?
          sleep 0.1 # not too fast there tiger
        else
          # {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
          record_hypotheses(JSON.parse(easy.body_str))
          write_capture
          sleep 0.1 # not too fast there tiger
          break
        end
      end
    ensure
      chunk.clean
    end

    private

    # Path of the results file for +path+: final extension swapped for
    # ".json", or ".json" appended when there is no extension.
    def json_path_for(path)
      ext = File.extname(path)
      stem = ext.empty? ? path : path[0...-ext.length]
      "#{stem}.json"
    end

    # Appends every hypothesis in a decoded response to captured_json and adds
    # its confidence to the running total. A response without a "hypotheses"
    # list, or a hypothesis without a confidence, contributes nothing/zero.
    def record_hypotheses(data)
      Array(data['hypotheses']).each do |hypothesis|
        self.captured_json << [hypothesis['utterance'], hypothesis['confidence']]
        self.confidence += hypothesis['confidence'].to_f
      end
    end

    # Rewrites captured_file with everything captured so far and the mean
    # confidence across all hypotheses (0 when there are none).
    def write_capture
      count = self.captured_json.size
      confidence_calc = count > 0 ? self.confidence / count : 0
      File.open("#{self.captured_file}", "wb") {|f|
        f << {:captured_json => captured_json, :confidence => confidence_calc}.to_json
      }
    end

  end

end
@@ -0,0 +1,6 @@
1
+ # -*- encoding: binary -*-
2
module Speech
  # Holds the gem's release number; read by the gemspec.
  class Info
    VERSION = '0.0.2'
  end
end
@@ -0,0 +1,17 @@
1
# Make lib/ loadable so the version constant can be read from the source tree.
lib_dir = File.expand_path("lib", File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir)
require "speech/version"

# Package description for the speech2text gem.
Gem::Specification.new do |spec|
  spec.name        = "speech2text"
  spec.version     = Speech::Info::VERSION
  spec.authors     = ["Todd A. Fisher"]
  spec.email       = "todd.fisher@gmail.com"
  spec.homepage    = "https://github.com/taf2/speech2text"
  spec.summary     = "Speech to Text Library"
  spec.description = "Super powers of Google wrapped in a nice Ruby interface"

  # Everything under lib/, bin/ and test/, plus the top-level project files.
  spec.files       = Dir.glob(["{lib,bin,test}/**/*", "Rakefile", "README.rdoc", "*.gemspec"])
  spec.executables = ["speech2text"]

  # Runtime dependencies: HTTP client and JSON encoding/decoding.
  spec.add_dependency("curb")
  spec.add_dependency("json")
end
Binary file
@@ -0,0 +1,9 @@
1
+ # -*- encoding: binary -*-
2
+ require 'test/unit'
3
+ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ require 'speech'
5
+
6
# Exercises Speech::AudioInspector::Duration, which is pure string/number
# handling and therefore needs neither ffmpeg nor an audio fixture.
class SpeechAudioInspectorTest < Test::Unit::TestCase
  # Parsing an ffmpeg-style "HH:MM:SS.ff" stamp.
  def test_audio_inspector
    duration = Speech::AudioInspector::Duration.new("00:00:03.52")

    assert_equal "00", duration.hours
    assert_equal "00", duration.minutes
    assert_equal "03.52", duration.seconds
    assert_in_delta 3.52, duration.to_f, 0.0001
    assert_equal "00:00:03:52", duration.to_s
  end

  # Building a duration from a number of seconds (sub-hour values).
  def test_duration_from_seconds
    assert_equal "00:00:30:00", Speech::AudioInspector::Duration.from_seconds(30).to_s
    assert_equal "00:01:30:00", Speech::AudioInspector::Duration.from_seconds(90).to_s
  end
end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: binary -*-
2
+ require 'test/unit'
3
+ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ require 'speech'
5
+
6
# Integration test for Speech::AudioSplitter; needs ffmpeg and flac on the
# PATH and the i-like-pickles.wav fixture that sits next to this file.
class SpeechAudioSplitterTest < Test::Unit::TestCase

  def test_audio_splitter
    # The fixture is referenced by bare name, so run from this file's
    # directory; `rake test` starts in the project root, where the relative
    # name would not resolve.
    Dir.chdir(File.dirname(__FILE__)) do
      splitter = Speech::AudioSplitter.new("i-like-pickles.wav", 1)

      assert_equal '00:00:03:52', splitter.duration.to_s
      # floats are compared with a tolerance rather than exact equality
      assert_in_delta 3.52, splitter.duration.to_f, 0.0001

      chunks = splitter.split
      assert_equal 3, chunks.size
      chunks.each do|chunk|
        begin
          chunk.build.to_flac
          assert File.exist?(chunk.chunk)
          assert File.exist?(chunk.flac_chunk)
        ensure
          # remove the generated files even when an assertion above fails
          chunk.clean
        end
        assert !File.exist?(chunk.chunk)
        assert !File.exist?(chunk.flac_chunk)
      end
    end
  end

end
@@ -0,0 +1,21 @@
1
+ # -*- encoding: binary -*-
2
+ require 'test/unit'
3
+ $:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ require 'speech'
5
+
6
# End-to-end test for Speech::AudioToText; needs ffmpeg, flac, network access
# to the recognition endpoint, and the i-like-pickles.wav fixture that sits
# next to this file.
class SpeechAudioToTextTest < Test::Unit::TestCase
  def test_audio_to_text
    # The fixture (and the JSON written beside it) are referenced by bare
    # name, so run from this file's directory; `rake test` starts in the
    # project root, where the relative name would not resolve.
    Dir.chdir(File.dirname(__FILE__)) do
      audio = Speech::AudioToText.new("i-like-pickles.wav")
      begin
        captured_json = audio.to_text
        assert captured_json
        assert captured_json.key?("captured_json")
        assert !captured_json['captured_json'].empty?
        assert_equal ['captured_json', 'confidence'], captured_json.keys.sort
        assert_equal "I like pickles", captured_json['captured_json'].flatten.first
        assert captured_json['confidence'] > 0.9
        # {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
        # puts captured_json.inspect
      ensure
        # audio is guaranteed to be set here, so a constructor failure is
        # reported as itself rather than as a NoMethodError on nil
        audio.clean
      end
    end
  end
end
Binary file
Binary file
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: speech2text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Todd A. Fisher
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-03-24 00:00:00.000000000 -04:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: curb
17
+ requirement: &2157003720 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *2157003720
26
+ - !ruby/object:Gem::Dependency
27
+ name: json
28
+ requirement: &2157003280 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: *2157003280
37
+ description: Super powers of Google wrapped in a nice Ruby interface
38
+ email: todd.fisher@gmail.com
39
+ executables:
40
+ - speech2text
41
+ extensions: []
42
+ extra_rdoc_files: []
43
+ files:
44
+ - lib/speech/audio_inspector.rb
45
+ - lib/speech/audio_splitter.rb
46
+ - lib/speech/audio_to_text.rb
47
+ - lib/speech/version.rb
48
+ - lib/speech.rb
49
+ - bin/speech2text
50
+ - test/audio_inspector_test.rb
51
+ - test/audio_splitter_test.rb
52
+ - test/audio_to_text_test.rb
53
+ - test/i-like-pickles.wav
54
+ - test/SampleAudio.wav
55
+ - test/samples/i-like-pickles.wav
56
+ - Rakefile
57
+ - README.rdoc
58
+ - speech2text.gemspec
59
+ has_rdoc: true
60
+ homepage: https://github.com/taf2/speech2text
61
+ licenses: []
62
+ post_install_message:
63
+ rdoc_options: []
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 1.6.2
81
+ signing_key:
82
+ specification_version: 3
83
+ summary: Speech to Text Library
84
+ test_files: []