speech2text 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +17 -0
- data/Rakefile +9 -0
- data/bin/speech2text +7 -0
- data/lib/speech.rb +9 -0
- data/lib/speech/audio_inspector.rb +43 -0
- data/lib/speech/audio_splitter.rb +87 -0
- data/lib/speech/audio_to_text.rb +73 -0
- data/lib/speech/version.rb +6 -0
- data/speech2text.gemspec +17 -0
- data/test/SampleAudio.wav +0 -0
- data/test/audio_inspector_test.rb +9 -0
- data/test/audio_splitter_test.rb +26 -0
- data/test/audio_to_text_test.rb +21 -0
- data/test/i-like-pickles.wav +0 -0
- data/test/samples/i-like-pickles.wav +0 -0
- metadata +84 -0
data/README.rdoc
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
== Speech2Text
|
2
|
+
|
3
|
+
Using the power of ffmpeg, flac, Google and Ruby, here is a simple interface for converting speech to text.
|
4
|
+
|
5
|
+
Using a new undocumented speech API from Google with the help of this article: http://mikepultz.com/2011/03/accessing-google-speech-api-chrome-11/
|
6
|
+
|
7
|
+
We're able to provide a very simple API in Ruby to decode simple audio to text.
|
8
|
+
|
9
|
+
The API from Google is not yet public and so may change. It also seems to be very fragile: more often than not it will return a 500, so the library has retry code built in - for larger audio files 10+ failures may be returned before a successful result is retrieved...
|
10
|
+
|
11
|
+
It also appears that the API only likes smaller audio files so there is a built in chunker that allows us to split the audio up into smaller chunks.
|
12
|
+
|
13
|
+
== Example
|
14
|
+
|
15
|
+
audio = Speech::AudioToText.new("i-like-pickles.wav")
|
16
|
+
puts audio.to_text.inspect
|
17
|
+
=> {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
|
data/Rakefile
ADDED
data/bin/speech2text
ADDED
data/lib/speech.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
module Speech

  # Determines the duration of an audio file by running `ffmpeg -i FILE` and
  # parsing the "Duration: HH:MM:SS.ff" line of its banner.
  class AudioInspector
    attr_accessor :duration

    # A clock-style duration. hours, minutes and seconds hold the string
    # fields split out of "HH:MM:SS.ff"; total_seconds is their sum as a Float.
    class Duration
      attr_accessor :hours, :minutes, :seconds, :total_seconds

      # duration_str - text in the "HH:MM:SS.ff" form, e.g. "00:00:03.52".
      def initialize(duration_str)
        self.hours, self.minutes, self.seconds = duration_str.split(':')
        self.total_seconds = (hours.to_i * 3600) + (minutes.to_i * 60) + seconds.to_f
      end

      # Renders the duration as "HH:MM:SS:FF" where FF is hundredths of a second.
      def to_s
        whole, fraction = seconds.to_s.split('.')
        # Pad/truncate the fraction to exactly two digits so "1.5" renders as
        # 50 hundredths and float noise ("1.5200000000000003") is cut off.
        hundredths = (fraction.to_s + "00")[0, 2]
        # Convert with to_i before formatting: handing zero-padded strings such
        # as "08" to "%d" makes sprintf parse them as (invalid) octal and raise.
        sprintf "%02d:%02d:%02d:%02d", hours.to_i, minutes.to_i, whole.to_i, hundredths.to_i
      end

      # Returns the duration in seconds as stored in total_seconds.
      def to_f
        self.total_seconds
      end

      # Builds a Duration from a number of seconds (Integer or Float).
      # hours, minutes and seconds are stored as strings, like the parsed form.
      def self.from_seconds(seconds)
        duration = Duration.new("00:00:00.00")
        whole = seconds.to_i
        hours = whole / 3600
        # Only the minutes left over after removing whole hours belong here;
        # otherwise the seconds remainder goes negative for inputs >= 1 hour.
        minutes = (whole % 3600) / 60
        remainder = seconds - (hours * 3600) - (minutes * 60)
        # Trim binary floating point noise (e.g. 61.52 - 60) to hundredths.
        remainder = remainder.round(2) if remainder.is_a?(Float)

        duration.hours = hours.to_s
        duration.minutes = minutes.to_s
        duration.seconds = remainder.to_s
        duration.total_seconds = seconds.to_f
        duration
      end

    end

    # Extracts the "HH:MM:SS.ff" text from ffmpeg's banner output.
    # Returns nil when the output has no "Duration: ...," entry.
    def self.parse_duration(ffmpeg_output)
      # Treat the output as raw bytes: file metadata echoed by ffmpeg may not
      # be valid in the default external encoding.
      text = ffmpeg_output.to_s.dup.force_encoding("BINARY")
      # Stop at the first comma so ", start: ..., bitrate: ..." is not captured.
      match = text.match(/Duration: ([^,]+),/)
      match && match[1].strip
    end

    # file - path of the audio file to inspect.
    # Raises RuntimeError when ffmpeg is unavailable or reports no duration.
    def initialize(file)
      # Argument-array form: the path is never interpreted by a shell, so
      # spaces and shell metacharacters in file names are safe.
      output = IO.popen(["ffmpeg", "-i", file.to_s], :err => [:child, :out]) { |io| io.read }
      duration_str = self.class.parse_duration(output)
      raise "Unable to read the duration of #{file} from ffmpeg" unless duration_str
      self.duration = Duration.new(duration_str)
    rescue Errno::ENOENT
      raise "ffmpeg must be installed and on the PATH to inspect #{file}"
    end

  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
module Speech

  # Cuts an audio file into time-based chunks with ffmpeg so that each piece
  # can be converted to flac separately.
  class AudioSplitter
    attr_accessor :original_file, :size, :duration, :chunks

    # One slice of the splitter's original file, identified by its offset and
    # duration in seconds, plus the files generated for it on disk.
    class AudioChunk
      attr_accessor :splitter, :chunk, :flac_chunk, :offset, :duration, :flac_rate

      # splitter - the AudioSplitter that owns the original file
      # offset   - start of the slice in seconds
      # duration - length of the slice in seconds
      def initialize(splitter, offset, duration)
        self.offset = offset
        self.duration = duration
        self.splitter = splitter
        self.chunk = chunk_path_for(splitter.original_file.to_s, offset)
      end

      # Runs ffmpeg to copy the slice of the original file into self.chunk.
      # Returns self so calls can be chained; raises RuntimeError on failure.
      def build
        # -ss/-t are given as plain seconds ("2.00"), which ffmpeg's time
        # duration syntax accepts; "HH:MM:SS:FF" is not a valid ffmpeg time.
        command = ["ffmpeg", "-y", "-i", splitter.original_file.to_s,
                   "-acodec", "copy", "-vcodec", "copy",
                   "-ss", seconds_arg(offset), "-t", seconds_arg(duration),
                   chunk]
        unless run_quietly(*command)
          raise "Failed to generate chunk at offset: #{offset}, duration: #{duration}\n#{command.join(' ')}"
        end
        self
      end

      # Converts the chunk to a 16000 Hz flac file stored at self.flac_chunk.
      # Raises RuntimeError when either the flac or the ffmpeg step fails.
      def to_flac
        self.flac_chunk = flac_path_for(chunk)
        # -f overwrites a stale flac left behind by an earlier aborted run.
        unless run_quietly("flac", "-f", "-o", flac_chunk, chunk)
          raise "failed to convert #{chunk} to flac"
        end

        # convert the audio file to 16K
        down_sampled = flac_chunk.sub(/\.flac\z/, '-sampled.flac')
        unless run_quietly("ffmpeg", "-i", flac_chunk, "-ar", "16000", "-y", down_sampled)
          raise "failed to convert to lower audio rate"
        end
        File.rename(down_sampled, flac_chunk)
        self.flac_rate = 16000
      end

      # Returns the raw bytes of the flac file.
      def to_flac_bytes
        File.open(self.flac_chunk, "rb") { |f| f.read }
      end

      # delete the chunk file and, when present, its flac counterpart
      def clean
        File.unlink self.chunk if File.exist?(self.chunk)
        File.unlink self.flac_chunk if self.flac_chunk && File.exist?(self.flac_chunk)
      end

      private

      # Names the chunk "chunk-<name>-<offset><ext>" next to the original
      # file. A bare file name keeps producing a bare chunk name.
      def chunk_path_for(original, offset)
        dir, name = File.split(original)
        ext = File.extname(name)
        chunk_name = "chunk-#{File.basename(name, ext)}-#{offset}#{ext}"
        dir == "." ? chunk_name : File.join(dir, chunk_name)
      end

      # Swaps only the trailing extension of path for ".flac".
      def flac_path_for(path)
        ext = File.extname(path)
        path[0, path.length - ext.length] + ".flac"
      end

      # Formats a number of seconds for ffmpeg's -ss/-t options.
      def seconds_arg(value)
        format("%.2f", value.to_f)
      end

      # Runs a command without a shell (arguments are passed verbatim, so
      # file names need no quoting) and discards its output.
      # Returns true on success, false or nil otherwise.
      def run_quietly(*command)
        system(*command, :out => "/dev/null", :err => "/dev/null")
      end

    end

    # file       - path of the audio file to split
    # chunk_size - target chunk length in seconds; must be greater than zero
    def initialize(file, chunk_size=30)
      raise ArgumentError, "chunk_size must be greater than zero" unless chunk_size.to_f > 0
      self.original_file = file
      self.duration = AudioInspector.new(file).duration
      self.size = chunk_size
      self.chunks = []
    end

    # Computes the chunk list (without touching the disk) and returns it.
    # All chunks but the last are `size` seconds long; the last one absorbs
    # the remainder, so it is between `size` and 2 * `size` seconds long. A
    # file shorter than 2 * `size` becomes a single chunk.
    def split
      # Start from scratch so calling split twice does not duplicate chunks.
      self.chunks = []
      total = self.duration.to_f
      full_chunks = (total / size).to_i
      last_chunk = ((total % size) * 100).round / 100.0

      (full_chunks - 1).times do |chunkid|
        chunks << AudioChunk.new(self, chunkid * self.size, self.size)
      end

      if chunks.empty?
        chunks << AudioChunk.new(self, 0, total)
      else
        # No integer truncation here: fractional chunk sizes must keep
        # consecutive chunks contiguous.
        chunks << AudioChunk.new(self, chunks.last.offset + chunks.last.duration, self.size + last_chunk)
      end

      chunks
    end

  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
module Speech

  # Converts an audio file to text by splitting it into chunks, converting
  # each chunk to flac and posting it to Google's speech recognition endpoint.
  # Results are accumulated in captured_json and mirrored to captured_file.
  class AudioToText
    attr_accessor :file, :rate, :captured_json, :confidence, :captured_file

    # Default number of consecutive HTTP 500 responses tolerated per chunk.
    MAX_RETRIES = 50

    # file - path of the audio file to transcribe.
    def initialize(file)
      self.file = file
      # Swap whatever extension the file has for ".json". Only rewriting
      # ".wav" would make captured_file equal to the audio file itself for
      # any other name, and the audio would then be overwritten with JSON.
      ext = File.extname(file)
      self.captured_file = file[0, file.length - ext.length] + ".json"
      self.captured_json = []
      self.confidence = 0.0
    end

    # Transcribes the file and returns the parsed contents of captured_file:
    # a Hash with "captured_json" (utterance/confidence pairs) and
    # "confidence" (the average confidence).
    def to_text
      # Start from a clean slate so a second call does not double-count.
      self.captured_json = []
      self.confidence = 0.0

      url = "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=speech2text&lang=en-US"
      splitter = Speech::AudioSplitter.new(file) # based off the wave file because flac doesn't tell us the duration
      easy = Curl::Easy.new(url)
      splitter.split.each do |chunk|
        begin
          chunk.build.to_flac
          convert_chunk(easy, chunk)
        ensure
          # Also removes the temporary files when build/to_flac raised.
          chunk.clean
        end
      end
      JSON.parse(File.read(self.captured_file))
    end

    # Deletes the JSON capture file if it exists.
    def clean
      File.unlink self.captured_file if self.captured_file && File.exist?(self.captured_file)
    end

  protected

    # Posts one chunk and records the returned hypotheses.
    #
    # easy    - HTTP client used for the post (a Curl::Easy in to_text)
    # chunk   - object providing duration, flac_rate, to_flac_bytes and clean
    # options - :max_retries, the number of consecutive 500 responses to
    #           tolerate before giving up (default MAX_RETRIES)
    #
    # Raises RuntimeError when the retry budget is exhausted. The chunk's
    # files are always cleaned up.
    def convert_chunk(easy, chunk, options={})
      max_retries = options.fetch(:max_retries, MAX_RETRIES)
      puts "sending chunk of size #{chunk.duration}..."
      flac_bytes = chunk.to_flac_bytes # read once, reused on every retry
      failures = 0

      loop do
        easy.headers['Content-Type'] = "audio/x-flac; rate=#{chunk.flac_rate}"
        easy.post_body = "Content=#{flac_bytes}"
        easy.on_progress {|dl_total, dl_now, ul_total, ul_now| printf("%.2f/%.2f\r", ul_now, ul_total); true }
        easy.on_complete {|_| puts }
        easy.http_post
        break unless easy.response_code == 500

        failures += 1
        if failures > max_retries
          raise "giving up after #{failures} consecutive 500 responses from the speech API"
        end
        puts "500 from google retry after 0.5 seconds"
        sleep 0.5 # wait longer on error?
        sleep 0.1 # not too fast there tiger
      end

      # {"status":0,"id":"ce178ea89f8b17d8e8298c9c7814700a-1","hypotheses":[{"utterance":"I like pickles","confidence":0.92731786}]}
      record_hypotheses(JSON.parse(easy.body_str))
      write_captured_file
      sleep 0.1 # not too fast there tiger
      nil
    ensure
      chunk.clean
    end

  private

    # Appends every hypothesis in a parsed response to captured_json and adds
    # its confidence to the running total. Tolerates responses without a
    # "hypotheses" list and hypotheses without a confidence.
    def record_hypotheses(data)
      (data['hypotheses'] || []).each do |hypothesis|
        self.captured_json << [hypothesis['utterance'], hypothesis['confidence']]
        self.confidence += hypothesis['confidence'].to_f
      end
    end

    # Writes the captured utterances and their average confidence as JSON.
    def write_captured_file
      count = self.captured_json.size
      average = count > 0 ? self.confidence / count : 0
      File.open(self.captured_file, "wb") do |f|
        f << {:captured_json => captured_json, :confidence => average}.to_json
      end
    end

  end

end
|
data/speech2text.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Make lib/ loadable so the version constant can be read from the source tree.
lib_dir = File.expand_path("lib", File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir)
require "speech/version"

# Gem specification for speech2text.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = "speech2text"
  spec.version     = Speech::Info::VERSION
  spec.summary     = "Speech to Text Library"
  spec.description = "Super powers of Google wrapped in a nice Ruby interface"
  spec.homepage    = "https://github.com/taf2/speech2text"

  # Author
  spec.authors = ["Todd A. Fisher"]
  spec.email   = "todd.fisher@gmail.com"

  # Packaged files and the command line entry point
  spec.files       = Dir["{lib,bin,test}/**/*", "Rakefile", "README.rdoc", "*.gemspec"]
  spec.executables = %w(speech2text)

  # Runtime dependencies
  %w(curb json).each { |gem_name| spec.add_dependency gem_name }
end
|
Binary file
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
require 'test/unit'
|
3
|
+
$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
require 'speech'
|
5
|
+
|
6
|
+
class SpeechAudioSplitterTest < Test::Unit::TestCase

  # Splits the sample recording into one second chunks and checks that every
  # chunk can be built, converted to flac and cleaned up again.
  def test_audio_splitter
    splitter = Speech::AudioSplitter.new("i-like-pickles.wav", 1)

    # the inspected duration of the sample, both formatted and in seconds
    total = splitter.duration
    assert_equal '00:00:03:52', total.to_s
    assert_equal 3.52, total.to_f

    pieces = splitter.split
    assert_equal 3, pieces.size

    pieces.each do |piece|
      piece.build.to_flac
      # both generated files must exist before cleaning ...
      assert File.exist?(piece.chunk)
      assert File.exist?(piece.flac_chunk)

      piece.clean
      # ... and be gone afterwards
      assert !File.exist?(piece.chunk)
      assert !File.exist?(piece.flac_chunk)
    end
  end

end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: binary -*-
|
2
|
+
require 'test/unit'
|
3
|
+
$:.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
require 'speech'
|
5
|
+
|
6
|
+
class SpeechAudioToTextTest < Test::Unit::TestCase
  # Transcribes the sample recording and checks the shape and content of the
  # result returned by to_text.
  def test_audio_to_text
    audio = Speech::AudioToText.new("i-like-pickles.wav")
    captured_json = audio.to_text
    assert captured_json
    assert captured_json.key?("captured_json")
    assert !captured_json['captured_json'].empty?
    assert_equal ['captured_json', 'confidence'], captured_json.keys.sort
    assert_equal "I like pickles", captured_json['captured_json'].flatten.first
    assert captured_json['confidence'] > 0.9
    # {"captured_json"=>[["I like pickles", 0.92731786]], "confidence"=>0.92731786}
    # puts captured_json.inspect
  ensure
    # audio is still nil when the constructor itself raised; the guard keeps
    # that original failure from being masked by a NoMethodError raised here.
    audio.clean if audio
  end
end
|
Binary file
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: speech2text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Todd A. Fisher
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-03-24 00:00:00.000000000 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: curb
|
17
|
+
requirement: &2157003720 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *2157003720
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: json
|
28
|
+
requirement: &2157003280 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *2157003280
|
37
|
+
description: Super powers of Google wrapped in a nice Ruby interface
|
38
|
+
email: todd.fisher@gmail.com
|
39
|
+
executables:
|
40
|
+
- speech2text
|
41
|
+
extensions: []
|
42
|
+
extra_rdoc_files: []
|
43
|
+
files:
|
44
|
+
- lib/speech/audio_inspector.rb
|
45
|
+
- lib/speech/audio_splitter.rb
|
46
|
+
- lib/speech/audio_to_text.rb
|
47
|
+
- lib/speech/version.rb
|
48
|
+
- lib/speech.rb
|
49
|
+
- bin/speech2text
|
50
|
+
- test/audio_inspector_test.rb
|
51
|
+
- test/audio_splitter_test.rb
|
52
|
+
- test/audio_to_text_test.rb
|
53
|
+
- test/i-like-pickles.wav
|
54
|
+
- test/SampleAudio.wav
|
55
|
+
- test/samples/i-like-pickles.wav
|
56
|
+
- Rakefile
|
57
|
+
- README.rdoc
|
58
|
+
- speech2text.gemspec
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: https://github.com/taf2/speech2text
|
61
|
+
licenses: []
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options: []
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
requirements: []
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.6.2
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: Speech to Text Library
|
84
|
+
test_files: []
|