word_aligner 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9a4c26333b5e0991c70d0e61ef5435268672a112
4
+ data.tar.gz: 2ec6efcb6606676f2e30bd26425bd21843278018
5
+ SHA512:
6
+ metadata.gz: 199f2c4f5288a60c907a9d842951e740ec712eeaca1a9371a74b96db92d2053107543809b107c23bd025f532f5a75add474f5ff7f12cd163d0c99cb728cb5ffd
7
+ data.tar.gz: 95ae32a917c4e2578bb6087973c0cf195eb4b41b56e771b37d1d138b336440163e8bee9e1ebf015bb48b9690831654ed08fcce8c962dcbae3f11d765f9f01d86
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
# Fetch gems over TLS rather than plain HTTP.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Runtime dependency: lib/word_aligner/aligner.rb requires
# 'unicode_utils/upcase', so it must live in the default group for the
# generated gemspec to list it as a runtime dependency.
gem "unicode_utils"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "guard"
  gem "guard-rspec" # declared once; the original listed it twice
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 1.8.7"
  gem "rspec-core"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,115 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (4.0.0)
5
+ i18n (~> 0.6, >= 0.6.4)
6
+ minitest (~> 4.2)
7
+ multi_json (~> 1.3)
8
+ thread_safe (~> 0.1)
9
+ tzinfo (~> 0.3.37)
10
+ addressable (2.3.5)
11
+ atomic (1.1.13)
12
+ builder (3.2.2)
13
+ coderay (1.0.9)
14
+ diff-lcs (1.2.4)
15
+ faraday (0.8.8)
16
+ multipart-post (~> 1.2.0)
17
+ ffi (1.9.0)
18
+ formatador (0.2.4)
19
+ git (1.2.6)
20
+ github_api (0.10.1)
21
+ addressable
22
+ faraday (~> 0.8.1)
23
+ hashie (>= 1.2)
24
+ multi_json (~> 1.4)
25
+ nokogiri (~> 1.5.2)
26
+ oauth2
27
+ guard (1.8.2)
28
+ formatador (>= 0.2.4)
29
+ listen (>= 1.0.0)
30
+ lumberjack (>= 1.0.2)
31
+ pry (>= 0.9.10)
32
+ thor (>= 0.14.6)
33
+ guard-rspec (3.0.3)
34
+ guard (>= 1.8)
35
+ rspec (~> 2.13)
36
+ hashie (2.0.5)
37
+ highline (1.6.19)
38
+ httpauth (0.2.0)
39
+ i18n (0.6.5)
40
+ jeweler (1.8.7)
41
+ builder
42
+ bundler (~> 1.0)
43
+ git (>= 1.2.5)
44
+ github_api (= 0.10.1)
45
+ highline (>= 1.6.15)
46
+ nokogiri (= 1.5.10)
47
+ rake
48
+ rdoc
49
+ json (1.8.0)
50
+ jwt (0.1.8)
51
+ multi_json (>= 1.5)
52
+ listen (1.3.1)
53
+ rb-fsevent (>= 0.9.3)
54
+ rb-inotify (>= 0.9)
55
+ rb-kqueue (>= 0.2)
56
+ lumberjack (1.0.4)
57
+ method_source (0.8.2)
58
+ minitest (4.7.5)
59
+ multi_json (1.8.0)
60
+ multi_xml (0.5.5)
61
+ multipart-post (1.2.0)
62
+ nokogiri (1.5.10)
63
+ oauth2 (0.9.2)
64
+ faraday (~> 0.8)
65
+ httpauth (~> 0.2)
66
+ jwt (~> 0.1.4)
67
+ multi_json (~> 1.0)
68
+ multi_xml (~> 0.5)
69
+ rack (~> 1.2)
70
+ pry (0.9.12.2)
71
+ coderay (~> 1.0.5)
72
+ method_source (~> 0.8)
73
+ slop (~> 3.4)
74
+ rack (1.5.2)
75
+ rake (10.1.0)
76
+ rb-fsevent (0.9.3)
77
+ rb-inotify (0.9.1)
78
+ ffi (>= 0.5.0)
79
+ rb-kqueue (0.2.0)
80
+ ffi (>= 0.5.0)
81
+ rdoc (3.12.2)
82
+ json (~> 1.4)
83
+ rspec (2.14.1)
84
+ rspec-core (~> 2.14.0)
85
+ rspec-expectations (~> 2.14.0)
86
+ rspec-mocks (~> 2.14.0)
87
+ rspec-core (2.14.5)
88
+ rspec-expectations (2.14.2)
89
+ diff-lcs (>= 1.1.3, < 2.0)
90
+ rspec-mocks (2.14.3)
91
+ shoulda (3.5.0)
92
+ shoulda-context (~> 1.0, >= 1.0.1)
93
+ shoulda-matchers (>= 1.4.1, < 3.0)
94
+ shoulda-context (1.1.5)
95
+ shoulda-matchers (2.3.0)
96
+ activesupport (>= 3.0.0)
97
+ slop (3.4.6)
98
+ thor (0.18.1)
99
+ thread_safe (0.1.2)
100
+ atomic
101
+ tzinfo (0.3.37)
102
+ unicode_utils (1.4.0)
103
+
104
+ PLATFORMS
105
+ ruby
106
+
107
+ DEPENDENCIES
108
+ bundler (~> 1.0)
109
+ guard
110
+ guard-rspec
111
+ jeweler (~> 1.8.7)
112
+ rdoc (~> 3.12)
113
+ rspec-core
114
+ shoulda
115
+ unicode_utils
data/Guardfile ADDED
@@ -0,0 +1,8 @@
1
# Guard configuration: re-run RSpec whenever a watched file changes.
# More info at https://github.com/guard/guard#readme

guard :rspec, cli: '--color --format nested' do
  # A modified spec file is re-run directly.
  watch(%r{^spec/.+_spec\.rb$})

  # A modified library file maps to the spec with the same relative path
  # under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # Changing the shared helper re-runs the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Lunatyq, skpvox
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # word_aligner
2
+
3
+ word_aligner lets you compare strings the way the word_align.pl script from CMU Sphinx does.
4
+
5
+ It calculates the distance between the two strings in terms of insertions, deletions and substitutions.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your Gemfile:
10
+
11
+ ```
12
+ gem 'word_aligner'
13
+ ```
14
+
15
+ Then execute:
16
+
17
+ ```
18
+ $ bundle
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ ```
24
+ error_rate = WordAligner.align('example shown line', 'they shown line')
25
+ error_rate.aligned_transcription => "EXAMPLE shown line"
26
+ error_rate.aligned_hypothesis => "THEY shown line"
27
+ error_rate.words => 3
28
+ error_rate.insertions => 0
29
+ error_rate.deletions => 0
30
+ error_rate.substitutions => 1
31
+
32
+ error_rate.correct => 2
33
+ error_rate.errors => 1
34
+ error_rate.percent_correct => 66.0
35
+ error_rate.percent_error => 33.0
36
+ error_rate.percent_accuracy => 66.0
37
+ ```
38
+
39
+ ## Contributing
40
+
41
+ 1. Fork it
42
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
43
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
44
+ 4. Push to the branch (`git push origin my-new-feature`)
45
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
# encoding: utf-8

# Rake tasks for the word_aligner gem: packaging/release (Jeweler),
# tests and RDoc generation.

require 'rubygems'
require 'bundler'
begin
  # Load both groups so the packaging and test tasks see every gem they need.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "word_aligner"
  gem.homepage = "http://github.com/lunatyq/word_aligner"
  gem.license = "MIT"
  # Real metadata instead of the generator's "TODO" placeholders; the
  # wording follows the project README.
  gem.summary = %Q{Word-level string alignment in the style of CMU Sphinx's word_align.pl}
  gem.description = %Q{Compares a transcription with a hypothesis the way the word_align.pl script from CMU Sphinx does, and reports the distance in terms of insertions, deletions and substitutions.}
  gem.email = "maciej@szukio.pl"
  gem.authors = ["Lunatyq"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
# NOTE(review): this task looks for test/**/test_*.rb, while the Gemfile and
# Guardfile are set up for RSpec under spec/ -- confirm which suite is meant
# to back the default task.
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline a VERSION file may end with so it does not
  # leak into the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "word_aligner #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,11 @@
1
+ require 'word_aligner/aligner'
2
+ require 'word_aligner/word_error_rate'
3
+
4
module WordAligner
  # Aligns +hypothesis+ against +transcription+.
  #
  # Builds an Aligner for the pair and returns whatever its
  # #word_error_rate method produces.
  def align(transcription, hypothesis)
    aligner = Aligner.new(transcription, hypothesis)
    aligner.word_error_rate
  end

  # Make #align callable as WordAligner.align (and private when mixed in).
  module_function :align
end
@@ -0,0 +1,170 @@
1
+ require 'unicode_utils/upcase'
2
+
3
module WordAligner
  # Word-level aligner for a reference transcription and a hypothesis.
  #
  # Both strings are upcased and split on whitespace. The token lists are
  # aligned with a unit-cost edit-distance table, and walking that table
  # backwards classifies every aligned pair as a match, substitution,
  # insertion or deletion and produces two column-aligned sentences.
  class Aligner

    # Pointer codes stored in +backtrace_matrix+.
    INSERTION = 1
    DELETION = 2
    MATCHING = 3
    # Must stay MATCHING + 1: #align_tokens stores MATCHING + cost.
    SUBSTITUTE = 4

    attr_accessor :insertions, :substitutions, :deletions, :matching, :align_cost
    attr_accessor :aligned_transcription, :aligned_hypothesis

    attr_reader :transcription, :hypothesis
    attr_reader :reference_tokens, :hypothese_tokens

    attr_reader :align_matrix, :backtrace_matrix

    # transcription - reference String.
    # hypothesis    - String to compare against the reference.
    #
    # Only tokenizes; the alignment itself runs lazily on first use.
    def initialize(transcription, hypothesis)
      @transcription = transcription
      @hypothesis = hypothesis

      @insertions = 0
      @substitutions = 0
      @deletions = 0
      @matching = 0

      @reference_tokens = tokenize(transcription)
      @hypothese_tokens = tokenize(hypothesis)
    end

    # Returns a WordErrorRate built from #result.
    def word_error_rate
      WordErrorRate.new(result)
    end

    # Convenience entry point: aligns the two strings and returns the
    # #result Hash.
    #
    # The original called the then-private #result without ever running the
    # alignment, so it always raised NoMethodError.
    def self.align(original, hypothesis)
      new(original, hypothesis).result
    end

    # Returns a Hash describing the alignment (aligned sentences, the
    # original inputs, reference word count, per-category counts and the
    # total alignment cost). Runs the alignment on first call.
    def result
      align_sentences if align_matrix.nil?

      {
        aligned_transcription: aligned_transcription,
        aligned_hypothesis: aligned_hypothesis,
        transcription: transcription,
        hypothesis: hypothesis,
        transcription_words: reference_tokens.size,
        insertions: insertions,
        deletions: deletions,
        substitutions: substitutions,
        matching: matching,
        align_cost: align_cost
      }
    end

    private

    # Runs the three alignment phases in order.
    def align_sentences
      initialize_matrices
      align_tokens
      generate_aligned_sentences
    end

    # Renders the backtraced alignment as two sentences whose words line up
    # column by column. Agreeing words are shown in lower case, a missing
    # word is shown as '***', and every column is at least 3 wide.
    def generate_aligned_sentences
      columns = backtrace.map do |ref, hyp|
        # NOTE(review): String#downcase only folds ASCII before Ruby 2.4,
        # while tokens are upcased with UnicodeUtils -- confirm the
        # supported Ruby versions.
        if ref && ref == hyp
          ref = ref.downcase
          hyp = hyp.downcase
        end

        ref ||= '***'
        hyp ||= '***'

        width = [ref.size, hyp.size, 3].max
        [ref.ljust(width), hyp.ljust(width)]
      end

      self.aligned_transcription = columns.map(&:first).join(' ').strip
      self.aligned_hypothesis = columns.map(&:last).join(' ').strip
    end

    # Seeds the cost and pointer tables: row 0 holds the cost of inserting
    # the first j hypothesis words, column 0 the cost of deleting the first
    # i reference words.
    def initialize_matrices
      rows = reference_tokens.size
      cols = hypothese_tokens.size

      @align_matrix = Array.new(rows + 1) { |i| i.zero? ? (0..cols).to_a : [i] }
      @backtrace_matrix = Array.new(rows + 1) do |i|
        i.zero? ? Array.new(cols + 1, INSERTION) : [DELETION]
      end
      # The origin cell is never followed by #backtrace; kept as DELETION to
      # match the table layout of the original implementation.
      @backtrace_matrix[0][0] = DELETION
    end

    # Fills both tables. Ties are resolved in favour of match/substitution,
    # then insertion, then deletion.
    def align_tokens
      (1..reference_tokens.size).each do |i|
        (1..hypothese_tokens.size).each do |j|
          cost = reference_tokens[i - 1] == hypothese_tokens[j - 1] ? 0 : 1

          ins = align_matrix[i][j - 1] + 1
          del = align_matrix[i - 1][j] + 1
          subst = align_matrix[i - 1][j - 1] + cost
          min = [ins, del, subst].min

          align_matrix[i][j] = min

          backtrace_matrix[i][j] =
            if min == subst
              MATCHING + cost # MATCHING when cost is 0, SUBSTITUTE when 1
            elsif min == ins
              INSERTION
            else
              DELETION
            end
        end
      end

      self.align_cost = align_matrix[reference_tokens.size][hypothese_tokens.size]
    end

    # Follows the pointers from the bottom-right cell back to the origin,
    # updating the counters along the way.
    #
    # Returns an Array of [reference_word, hypothesis_word] pairs in sentence
    # order; +nil+ marks the missing side of an insertion or deletion.
    def backtrace
      i = reference_tokens.size
      j = hypothese_tokens.size

      alignment = []

      until i.zero? && j.zero?
        case backtrace_matrix[i][j]
        when INSERTION
          alignment.unshift [nil, hypothese_tokens[j - 1]]
          self.insertions += 1
          j -= 1
        when DELETION
          alignment.unshift [reference_tokens[i - 1], nil]
          self.deletions += 1
          i -= 1
        when MATCHING
          alignment.unshift [reference_tokens[i - 1], hypothese_tokens[j - 1]]
          self.matching += 1
          i -= 1
          j -= 1
        when SUBSTITUTE
          alignment.unshift [reference_tokens[i - 1], hypothese_tokens[j - 1]]
          self.substitutions += 1
          i -= 1
          j -= 1
        else
          # Defensive: an unknown pointer would otherwise loop forever.
          break
        end
      end

      alignment
    end

    # Upcases +word+ and splits it on whitespace into an Array of tokens.
    def tokenize(word)
      UnicodeUtils.upcase(word.rstrip).split(' ')
    end

  end
end