RubyGems - speech2text - Versions diffs - 0.3.1 → 0.3.2 - Mend

speech2text 0.3.1 → 0.3.2

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (7)

data/bin/speech2text +5 -0
data/lib/speech/audio_inspector.rb +1 -1
data/lib/speech/audio_splitter.rb +20 -4
data/lib/speech/audio_to_text.rb +13 -21
data/lib/speech/version.rb +1 -1
data/test/audio_to_text_test.rb +18 -0
metadata +6 -6

data/bin/speech2text CHANGED

@@ -3,5 +3,10 @@
 require 'speech'
+if ARGV[0].nil? || !File.exist?(ARGV[0])
+  STDERR.puts "usage: #{$0} input.wav"
+  exit(1)
+end
 captured_json = Speech::AudioToText.new(ARGV[0]).to_text
 puts captured_json.inspect

data/lib/speech/audio_inspector.rb CHANGED

@@ -14,7 +14,7 @@ module Speech
       def to_s
         s,f = seconds.split('.')
-        sprintf "%.2d:%.2d:%.2d:%.2d", self.hours.to_s.gsub(/^0/,'').to_i, self.minutes.to_s.gsub(/^0/,'').to_i, s.to_s.gsub(/^0/,'').to_i, (f||0)
+        sprintf "%.2d:%.2d:%.2d:%.2d", self.hours.to_i, self.minutes.to_i, s.to_i, (f||0).to_i
         #"#{hours}:#{minutes}:#{seconds}:#{f}"
       end

data/lib/speech/audio_splitter.rb CHANGED

@@ -5,22 +5,33 @@ module Speech
     attr_accessor :original_file, :size, :duration, :chunks
     class AudioChunk
-      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate
+      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate, :copied
       def initialize(splitter, offset, duration)
         self.offset = offset
         self.chunk = File.join(File.dirname(splitter.original_file), "chunk-" + File.basename(splitter.original_file).gsub(/\.(.*)$/, "-#{offset}" + '.\1'))
         self.duration = duration
         self.splitter = splitter
+        self.copied = false
+      end
+      def self.copy(splitter)
+        chunk = AudioChunk.new(splitter, 0, splitter.duration.to_f)
+        chunk.copied = true
+        system("cp #{splitter.original_file} #{chunk.chunk}")
+        chunk
       end
       # given the original file from the splitter and the chunked file name with duration and offset run the ffmpeg command
       def build
+        return self if self.copied
         # ffmpeg -y -i sample.audio.wav -acodec copy -vcodec copy -ss 00:00:00:00 -t 00:00:30:00 sample.audio.out.wav
         offset_ts = AudioInspector::Duration.from_seconds(self.offset)
         duration_ts = AudioInspector::Duration.from_seconds(self.duration)
+        # NOTE: kind of a hack, but if the original source is less than or equal to 1 second, we should skip ffmpeg
+        puts "building chunk: #{duration_ts.inspect} and offset: #{offset_ts}"
         #puts "offset: #{ offset_ts.to_s }, duration: #{duration_ts.to_s}"
-        cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk} >/dev/null 2>&1"
+        cmd = "ffmpeg -y -i #{splitter.original_file} -acodec copy -vcodec copy -ss #{offset_ts} -t #{duration_ts} #{self.chunk}"# >/dev/null 2>&1"
         if system(cmd)
           self
         else
@@ -30,7 +41,9 @@ module Speech
       # convert the audio file to flac format
       def to_flac
-        if system("flac #{chunk} >/dev/null 2>&1")
+        puts "convert: #{chunk} to flac"
+        if system("flac #{chunk}")
+          puts "success?"
           self.flac_chunk = chunk.gsub(File.extname(chunk), ".flac")
           # convert the audio file to 16K
           self.flac_rate = `ffmpeg -i #{self.flac_chunk} 2>&1`.strip.scan(/Audio: flac, (.*) Hz/).first.first.strip
@@ -42,6 +55,8 @@ module Speech
             raise "failed to convert to lower audio rate"
           end
+        else
+          raise "failed to convert chunk: #{chunk} with flac #{chunk}"
         end
       end
@@ -75,10 +90,11 @@ module Speech
       end
       if chunks.empty?
-        chunks << AudioChunk.new(self, 0, self.duration.to_f)
+        chunks << AudioChunk.copy(self)#, 0, self.duration.to_f)
       else
         chunks << AudioChunk.new(self, chunks.last.offset.to_i + chunks.last.duration.to_i, self.size + last_chunk)
       end
+      puts "Chunk count: #{chunks.size}"
       chunks
     end

data/lib/speech/audio_to_text.rb CHANGED

@@ -2,17 +2,16 @@
 module Speech
   class AudioToText
-    attr_accessor :file, :rate, :captured_json, :confidence, :captured_file
+    attr_accessor :file, :rate, :captured_json, :captured_file
     def initialize(file)
       self.file = file
       self.captured_file = self.file.gsub(/\.wav$/,'.json')
-      self.captured_json = []
-      self.confidence = 0.0
+      self.captured_json = {}
     end
     def to_text
-      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US&maxresults=1"
+      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US&maxresults=10"
       splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
       easy = Curl::Easy.new(url)
       splitter.split.each do|chunk|
@@ -31,10 +30,12 @@ module Speech
     def convert_chunk(easy, chunk, options={})
       puts "sending chunk of size #{chunk.duration}..."
       retrying = true
-      while retrying
+      retry_count = 0
+      while retrying && retry_count < 5
         #easy.verbose = true
         easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
         easy.headers['User-Agent'] = "https://github.com/taf2/speech2text"
+        #puts chunk.inspect
         easy.post_body = "Content=#{chunk.to_flac_bytes}"
         easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
         easy.on_complete {|easy| puts }
@@ -44,25 +45,16 @@ module Speech
         if easy.response_code == 500
           puts "500 from google retry after 0.5 seconds"
           retrying = true
-          sleep 0.5 # wait longer on error?
+          retry_count += 1
+          sleep 0.5 # wait longer on error?, google??
         else
           # {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
           data = JSON.parse(easy.body_str)
-          puts data.inspect
-          data['hypotheses'].each {|utterance|
-            puts utterance.inspect
-            self.captured_json << [utterance['utterance'], utterance['confidence']]
-            self.confidence += utterance['confidence']
-          }
-          File.open("#{self.captured_file}", "wb") {|f|
-            size = self.captured_json.size
-            if size > 0
-              confidence_calc = self.confidence / size
-            else
-              confidence_calc = 0
-            end
-            f << {:captured_json => captured_json, :confidence => confidence_calc}.to_json
-          }
+          self.captured_json['status'] = data['status']
+          self.captured_json['id'] = data['id']
+          self.captured_json['hypotheses'] = data['hypotheses'].map {|ut| [ut['utterance'], ut['confidence']] }
+          puts self.captured_json.inspect
+          File.open("#{self.captured_file}", "wb") {|f| f << captured_json.to_json }
           retrying = false
         end
         sleep 0.1 # not too fast there tiger

data/lib/speech/version.rb CHANGED

@@ -1,6 +1,6 @@
 # -*- encoding: binary -*-
 module Speech
   class Info
-    VERSION='0.3.1'
+    VERSION='0.3.2'
   end
 end

data/test/audio_to_text_test.rb CHANGED

@@ -18,4 +18,22 @@ class SpeechAudioToTextTest < Test::Unit::TestCase
   ensure
     audio.clean
   end
+  def test_short_audio_clip
+    audio = Speech::AudioToText.new("samples/i-like-pickles.chunk5.wav")
+    captured_json = audio.to_text
+    assert captured_json
+    assert captured_json.key?("hypotheses")
+    assert !captured_json['hypotheses'].empty?
+    #{"status"=>0, "id"=>"552de5ba35bb769ce3493ff113e158a8-1", "hypotheses"=>[["eagles", 0.7214844], ["pickles", nil], ["michaels", nil], ["giggles", nil], ["tickles", nil]]}
+    assert captured_json.keys.include?('status')
+    assert captured_json.keys.include?('id')
+    assert captured_json.keys.include?('hypotheses')
+    puts captured_json.inspect
+    assert_equal "eagles", captured_json['hypotheses'][0].first
+    assert_equal "pickles", captured_json['hypotheses'][1].first
+    #assert captured_json['confidence'] > 0.9
+  ensure
+    audio.clean
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: speech2text
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.2
   prerelease:
 platform: ruby
 authors:
@@ -9,12 +9,12 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-03 00:00:00.000000000 -04:00
+date: 2011-04-04 00:00:00.000000000 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: curb
-  requirement: &2163110280 !ruby/object:Gem::Requirement
+  requirement: &2163572140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -22,10 +22,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *2163110280
+  version_requirements: *2163572140
 - !ruby/object:Gem::Dependency
   name: json
-  requirement: &2163109720 !ruby/object:Gem::Requirement
+  requirement: &2163571700 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -33,7 +33,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *2163109720
+  version_requirements: *2163571700
 description: Super powers of Google wrapped in a nice Ruby interface
 email: todd.fisher@gmail.com
 executables: