RubyGems - speech2text - Versions diffs - 0.0.2 - Mend

speech2text 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/README.rdoc +17 -0
data/Rakefile +9 -0
data/bin/speech2text +7 -0
data/lib/speech.rb +9 -0
data/lib/speech/audio_inspector.rb +43 -0
data/lib/speech/audio_splitter.rb +87 -0
data/lib/speech/audio_to_text.rb +73 -0
data/lib/speech/version.rb +6 -0
data/speech2text.gemspec +17 -0
data/test/SampleAudio.wav +0 -0
data/test/audio_inspector_test.rb +9 -0
data/test/audio_splitter_test.rb +26 -0
data/test/audio_to_text_test.rb +21 -0
data/test/i-like-pickles.wav +0 -0
data/test/samples/i-like-pickles.wav +0 -0
metadata +84 -0

data/README.rdoc ADDED Viewed

@@ -0,0 +1,17 @@
+== Speech2Text
+Using the power of ffmpeg, flac, Google and Ruby, here is a simple interface for converting speech to text.
+Using a new, undocumented speech API from Google, and with the help of this article: http://mikepultz.com/2011/03/accessing-google-speech-api-chrome-11/
+we're able to provide a very simple API in Ruby to decode simple audio to text.
+The API from Google is not yet public and so may change. It also seems to be very fragile: more often than not it returns a 500, so the library has retry code built in. For larger audio files, 10 or more failures may occur before a successful result is retrieved.
+It also appears that the API only handles smaller audio files, so there is a built-in chunker that splits the audio into smaller chunks.
+== Example
+    audio = Speech::AudioToText.new("i-like-pickles.wav")
+    puts audio.to_text.inspect
+    => {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}

data/Rakefile ADDED Viewed

@@ -0,0 +1,9 @@
+require 'rake/testtask'
+desc "Default Task (Test project)"
+task :default => :test
+Rake::TestTask.new(:test) do |t|
+  t.test_files = FileList['test/*_test.rb']
+  t.verbose = false
+end

data/bin/speech2text ADDED Viewed

@@ -0,0 +1,7 @@
+#!/this/will/be/replaced/by/rubygems
+# -*- encoding: binary -*-
+require 'speech'
+captured_json = Speech::AudioToText.new(ARGV[0]).to_text
+puts captured_json.inspect

data/lib/speech.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- encoding: binary -*-
+require 'curb'
+require 'json'
+module Speech; end
+require 'speech/audio_inspector'
+require 'speech/audio_splitter'
+require 'speech/audio_to_text'

data/lib/speech/audio_inspector.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# -*- encoding: binary -*-
+module Speech
+  class AudioInspector
+    attr_accessor :duration
+    class Duration
+      attr_accessor :hours, :minutes, :seconds, :total_seconds
+      def initialize(duration_str)
+        self.hours, self.minutes, self.seconds = duration_str.split(':')
+        self.total_seconds = (self.hours.to_i * 3600) + (self.minutes.to_i * 60) + self.seconds.to_f
+      end
+      def to_s
+        s,f = seconds.split('.')
+        sprintf "%.2d:%.2d:%.2d:%.2d", self.hours, self.minutes, s, (f||0)
+        #"#{hours}:#{minutes}:#{seconds}:#{f}"
+      end
+      def to_f
+        self.total_seconds
+      end
+      def self.from_seconds(seconds)
+        duration = Duration.new("00:00:00.00")
+        duration.hours = (seconds.to_i / 3600).to_i
+        duration.minutes = (seconds / 60).to_i
+        duration.seconds = (seconds - (duration.minutes*60) - (duration.hours*3600)).to_s
+        duration.hours = duration.hours.to_s
+        duration.minutes = duration.minutes.to_s
+        duration
+      end
+    end
+    def initialize(file)
+      self.duration = Duration.new(`ffmpeg -i #{file} 2>&1`.strip.scan(/Duration: (.*),/).first.first)
+    end
+  end
+end

data/lib/speech/audio_splitter.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# -*- encoding: binary -*-
+module Speech
+  class AudioSplitter
+    attr_accessor :original_file, :size, :duration, :chunks
+    class AudioChunk
+      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate
+      def initialize(splitter, offset, duration)
+        self.offset = offset
+        self.chunk = "chunk-" + splitter.original_file.gsub(/\.(.*)$/, "-#{offset}" + '.\1')
+        self.duration = duration
+        self.splitter = splitter
+      end
+      # given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
+      def build
+        # ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
+        offset_ts = AudioInspector::Duration.from_seconds(self.offset)
+        duration_ts = AudioInspector::Duration.from_seconds(self.duration)
+        #puts "offset: #{ offset_ts.to_s }, duration: #{duration_ts.to_s}"
+        cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk} >/dev/null 2>&1"
+        if system(cmd)
+          self
+        else
+          raise "Failed to generate chunk at offset: #{offset_ts}, duration: #{duration_ts}\n#{cmd}"
+        end
+      end
+      # convert the audio file to flac format
+      def to_flac
+        if system("flac #{chunk} >/dev/null 2>&1")
+          self.flac_chunk = chunk.gsub(File.extname(chunk), ".flac")
+          # convert the audio file to 16K
+          self.flac_rate = `ffmpeg -i #{self.flac_chunk} 2>&1`.strip.scan(/Audio: flac, (.*) Hz/).first.first.strip
+          down_sampled = self.flac_chunk.gsub(/\.flac$/, '-sampled.flac')
+          if system("ffmpeg -i #{self.flac_chunk} -ar 16000 -y #{down_sampled} >/dev/null 2>&1")
+            system("mv #{down_sampled} #{self.flac_chunk} 2>&1 >/dev/null")
+            self.flac_rate = 16000
+          else
+            raise "failed to convert to lower audio rate"
+          end
+        end
+      end
+      def to_flac_bytes
+        File.read(self.flac_chunk)
+      end
+      # delete the chunk file
+      def clean
+        File.unlink self.chunk if File.exist?(self.chunk)
+        File.unlink self.flac_chunk if self.flac_chunk && File.exist?(self.flac_chunk)
+      end
+    end
+    def initialize(file, chunk_size=30)
+      self.original_file = file
+      self.duration = AudioInspector.new(file).duration
+      self.size = chunk_size
+      self.chunks = []
+    end
+    def split
+      # compute the total number of chunks
+      full_chunks = (self.duration.to_f / size).to_i
+      last_chunk = ((self.duration.to_f % size) * 100).round / 100.0
+      #puts "generate: #{full_chunks} chunks of #{size} seconds, last: #{last_chunk} seconds"
+      (full_chunks-1).times do|chunkid|
+        chunks << AudioChunk.new(self, chunkid * self.size, self.size)
+      end
+      if chunks.empty?
+        chunks << AudioChunk.new(self, 0, self.duration.to_f)
+      else
+        chunks << AudioChunk.new(self, chunks.last.offset.to_i + chunks.last.duration.to_i, self.size + last_chunk)
+      end
+      chunks
+    end
+  end
+end

data/lib/speech/audio_to_text.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# -*- encoding: binary -*-
+module Speech
+  # Transcribes an audio file: the file is split into chunks, each chunk is
+  # converted to FLAC and posted to Google's speech endpoint, and the
+  # accumulated results are written to a JSON file derived from the input path.
+  class AudioToText
+    attr_accessor :file, :rate, :captured_json, :confidence, :captured_file
+    # file - path of the audio file to transcribe
+    def initialize(file)
+      self.file = file
+      json_path = self.file.gsub(/\.wav$/,'.json')
+      # When the input does not end in ".wav" the substitution changes nothing;
+      # append the suffix so writing results can never overwrite the audio file.
+      json_path = "#{self.file}.json" if json_path == self.file
+      self.captured_file = json_path
+      self.captured_json = []
+      self.confidence = 0.0
+    end
+    # Runs the whole pipeline and returns the parsed contents of captured_file:
+    # a hash with "captured_json" ([utterance, confidence] pairs) and
+    # "confidence" (the mean confidence).
+    #
+    # options - :max_retries caps the number of retries after a 500 response,
+    #           per chunk; nil (the default) retries without limit.
+    def to_text(options={})
+      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US"
+      # Reset the accumulators so a repeated call does not mix in earlier results.
+      self.captured_json = []
+      self.confidence = 0.0
+      splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
+      easy = Curl::Easy.new(url)
+      splitter.split.each do |chunk|
+        begin
+          chunk.build.to_flac
+          convert_chunk(easy, chunk, options)
+        ensure
+          # Also removes the chunk files when build or to_flac raises.
+          chunk.clean
+        end
+      end
+      JSON.parse(File.read(self.captured_file))
+    end
+    # Deletes the JSON results file if it exists.
+    def clean
+      File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
+    end
+  protected
+    # Posts one chunk, retrying while the service answers 500, then records
+    # the hypotheses and rewrites captured_file. The chunk's files are removed
+    # when the method exits, whether it succeeds or raises.
+    #
+    # Raises RuntimeError once options[:max_retries] is exceeded.
+    def convert_chunk(easy, chunk, options={})
+      puts "sending chunk of size #{chunk.duration}..."
+      max_retries = options[:max_retries]
+      failures = 0
+      # Read the FLAC bytes once instead of on every retry.
+      flac_bytes = chunk.to_flac_bytes
+      retrying = true
+      while retrying
+        easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
+        easy.post_body = "Content=#{flac_bytes}"
+        easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
+        easy.on_complete {|_finished| puts }
+        easy.http_post
+        if easy.response_code == 500
+          failures += 1
+          if max_retries && failures > max_retries
+            raise "giving up after #{failures} 500 responses from google"
+          end
+          puts "500 from google retry after 0.5 seconds"
+          sleep 0.5 # wait longer on error?
+        else
+          # {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
+          data = JSON.parse(easy.body_str)
+          # Tolerate a response without hypotheses or without a confidence value.
+          (data['hypotheses'] || []).each do |hypothesis|
+            self.captured_json << [hypothesis['utterance'], hypothesis['confidence']]
+            self.confidence += hypothesis['confidence'].to_f
+          end
+          size = self.captured_json.size
+          confidence_calc = size > 0 ? self.confidence / size : 0
+          payload = {:captured_json => captured_json, :confidence => confidence_calc}.to_json
+          File.open(self.captured_file, "wb") {|f| f << payload }
+          retrying = false
+        end
+        sleep 0.1 # not too fast there tiger
+      end
+    ensure
+      chunk.clean
+    end
+  end
+end

data/lib/speech/version.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# -*- encoding: binary -*-
+module Speech
+  class Info
+    VERSION='0.0.2'
+  end
+end

data/speech2text.gemspec ADDED Viewed

@@ -0,0 +1,17 @@
+$:.unshift File.expand_path(File.dirname(__FILE__) + "/lib")
+require "speech/version"
+Gem::Specification.new do |s|
+  s.name           = "speech2text"
+  s.authors        = ["Todd A. Fisher"]
+  s.email          = "todd.fisher@gmail.com"
+  s.version        = Speech::Info::VERSION
+  s.homepage       = "https://github.com/taf2/speech2text"
+  s.summary        = "Speech to Text Library"
+  s.description    = "Super powers of Google wrapped in a nice Ruby interface"
+  s.files          = Dir["{lib,bin,test}/**/*", "Rakefile", "README.rdoc", "*.gemspec"]
+  s.executables    = %w(speech2text)
+  s.add_dependency "curb"
+  s.add_dependency "json"
+end

data/test/SampleAudio.wav ADDED Viewed

Binary file

data/test/audio_inspector_test.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- encoding: binary -*-
+require 'test/unit'
+$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
+require 'speech'
+class SpeechAudioInspectorTest < Test::Unit::TestCase
+  def test_audio_inspector
+  end
+end

data/test/audio_splitter_test.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# -*- encoding: binary -*-
+require 'test/unit'
+$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
+require 'speech'
+class SpeechAudioSplitterTest < Test::Unit::TestCase
+  def test_audio_splitter
+    splitter = Speech::AudioSplitter.new("i-like-pickles.wav", 1)
+    assert_equal '00:00:03:52', splitter.duration.to_s
+    assert_equal 3.52, splitter.duration.to_f
+    chunks = splitter.split
+    assert_equal 3, chunks.size
+    chunks.each do|chunk|
+      chunk.build.to_flac
+      assert File.exist? chunk.chunk
+      assert File.exist? chunk.flac_chunk
+      chunk.clean
+      assert !File.exist?(chunk.chunk)
+      assert !File.exist?(chunk.flac_chunk)
+    end
+  end
+end

data/test/audio_to_text_test.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# -*- encoding: binary -*-
+require 'test/unit'
+$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
+require 'speech'
+class SpeechAudioToTextTest < Test::Unit::TestCase
+  def test_audio_to_text
+    audio = Speech::AudioToText.new("i-like-pickles.wav")
+    captured_json = audio.to_text
+    assert captured_json
+    assert captured_json.key?("captured_json")
+    assert !captured_json['captured_json'].empty?
+    assert_equal ['captured_json', 'confidence'], captured_json.keys.sort
+    assert_equal "I like pickles", captured_json['captured_json'].flatten.first
+    assert captured_json['confidence'] > 0.9
+#    {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
+#    puts captured_json.inspect
+  ensure
+    audio.clean
+  end
+end

data/test/i-like-pickles.wav ADDED Viewed

Binary file

data/test/samples/i-like-pickles.wav ADDED Viewed

Binary file

metadata ADDED Viewed

@@ -0,0 +1,84 @@
+--- !ruby/object:Gem::Specification
+name: speech2text
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+  prerelease:
+platform: ruby
+authors:
+- Todd A. Fisher
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-03-24 00:00:00.000000000 -04:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: curb
+  requirement: &2157003720 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2157003720
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: &2157003280 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2157003280
+description: Super powers of Google wrapped in a nice Ruby interface
+email: todd.fisher@gmail.com
+executables:
+- speech2text
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/speech/audio_inspector.rb
+- lib/speech/audio_splitter.rb
+- lib/speech/audio_to_text.rb
+- lib/speech/version.rb
+- lib/speech.rb
+- bin/speech2text
+- test/audio_inspector_test.rb
+- test/audio_splitter_test.rb
+- test/audio_to_text_test.rb
+- test/i-like-pickles.wav
+- test/SampleAudio.wav
+- test/samples/i-like-pickles.wav
+- Rakefile
+- README.rdoc
+- speech2text.gemspec
+has_rdoc: true
+homepage: https://github.com/taf2/speech2text
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.6.2
+signing_key:
+specification_version: 3
+summary: Speech to Text Library
+test_files: []