word_aligner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9a4c26333b5e0991c70d0e61ef5435268672a112
4
+ data.tar.gz: 2ec6efcb6606676f2e30bd26425bd21843278018
5
+ SHA512:
6
+ metadata.gz: 199f2c4f5288a60c907a9d842951e740ec712eeaca1a9371a74b96db92d2053107543809b107c23bd025f532f5a75add474f5ff7f12cd163d0c99cb728cb5ffd
7
+ data.tar.gz: 95ae32a917c4e2578bb6087973c0cf195eb4b41b56e771b37d1d138b336440163e8bee9e1ebf015bb48b9690831654ed08fcce8c962dcbae3f11d765f9f01d86
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
source "http://rubygems.org"

# Runtime dependencies.
# lib/word_aligner/aligner.rb requires 'unicode_utils/upcase' when the gem is
# loaded, so unicode_utils has to be available to users of the gem and not
# only to developers. The Rakefile states that dependencies are defined in
# this Gemfile, so it is declared in the default group here.
gem "unicode_utils"

# Development dependencies: everything needed to run rake, guard and the
# test suites.
group :development do
  gem "guard"
  gem "guard-rspec"
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 1.8.7"
  gem "rspec-core"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,115 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activesupport (4.0.0)
5
+ i18n (~> 0.6, >= 0.6.4)
6
+ minitest (~> 4.2)
7
+ multi_json (~> 1.3)
8
+ thread_safe (~> 0.1)
9
+ tzinfo (~> 0.3.37)
10
+ addressable (2.3.5)
11
+ atomic (1.1.13)
12
+ builder (3.2.2)
13
+ coderay (1.0.9)
14
+ diff-lcs (1.2.4)
15
+ faraday (0.8.8)
16
+ multipart-post (~> 1.2.0)
17
+ ffi (1.9.0)
18
+ formatador (0.2.4)
19
+ git (1.2.6)
20
+ github_api (0.10.1)
21
+ addressable
22
+ faraday (~> 0.8.1)
23
+ hashie (>= 1.2)
24
+ multi_json (~> 1.4)
25
+ nokogiri (~> 1.5.2)
26
+ oauth2
27
+ guard (1.8.2)
28
+ formatador (>= 0.2.4)
29
+ listen (>= 1.0.0)
30
+ lumberjack (>= 1.0.2)
31
+ pry (>= 0.9.10)
32
+ thor (>= 0.14.6)
33
+ guard-rspec (3.0.3)
34
+ guard (>= 1.8)
35
+ rspec (~> 2.13)
36
+ hashie (2.0.5)
37
+ highline (1.6.19)
38
+ httpauth (0.2.0)
39
+ i18n (0.6.5)
40
+ jeweler (1.8.7)
41
+ builder
42
+ bundler (~> 1.0)
43
+ git (>= 1.2.5)
44
+ github_api (= 0.10.1)
45
+ highline (>= 1.6.15)
46
+ nokogiri (= 1.5.10)
47
+ rake
48
+ rdoc
49
+ json (1.8.0)
50
+ jwt (0.1.8)
51
+ multi_json (>= 1.5)
52
+ listen (1.3.1)
53
+ rb-fsevent (>= 0.9.3)
54
+ rb-inotify (>= 0.9)
55
+ rb-kqueue (>= 0.2)
56
+ lumberjack (1.0.4)
57
+ method_source (0.8.2)
58
+ minitest (4.7.5)
59
+ multi_json (1.8.0)
60
+ multi_xml (0.5.5)
61
+ multipart-post (1.2.0)
62
+ nokogiri (1.5.10)
63
+ oauth2 (0.9.2)
64
+ faraday (~> 0.8)
65
+ httpauth (~> 0.2)
66
+ jwt (~> 0.1.4)
67
+ multi_json (~> 1.0)
68
+ multi_xml (~> 0.5)
69
+ rack (~> 1.2)
70
+ pry (0.9.12.2)
71
+ coderay (~> 1.0.5)
72
+ method_source (~> 0.8)
73
+ slop (~> 3.4)
74
+ rack (1.5.2)
75
+ rake (10.1.0)
76
+ rb-fsevent (0.9.3)
77
+ rb-inotify (0.9.1)
78
+ ffi (>= 0.5.0)
79
+ rb-kqueue (0.2.0)
80
+ ffi (>= 0.5.0)
81
+ rdoc (3.12.2)
82
+ json (~> 1.4)
83
+ rspec (2.14.1)
84
+ rspec-core (~> 2.14.0)
85
+ rspec-expectations (~> 2.14.0)
86
+ rspec-mocks (~> 2.14.0)
87
+ rspec-core (2.14.5)
88
+ rspec-expectations (2.14.2)
89
+ diff-lcs (>= 1.1.3, < 2.0)
90
+ rspec-mocks (2.14.3)
91
+ shoulda (3.5.0)
92
+ shoulda-context (~> 1.0, >= 1.0.1)
93
+ shoulda-matchers (>= 1.4.1, < 3.0)
94
+ shoulda-context (1.1.5)
95
+ shoulda-matchers (2.3.0)
96
+ activesupport (>= 3.0.0)
97
+ slop (3.4.6)
98
+ thor (0.18.1)
99
+ thread_safe (0.1.2)
100
+ atomic
101
+ tzinfo (0.3.37)
102
+ unicode_utils (1.4.0)
103
+
104
+ PLATFORMS
105
+ ruby
106
+
107
+ DEPENDENCIES
108
+ bundler (~> 1.0)
109
+ guard
110
+ guard-rspec
111
+ jeweler (~> 1.8.7)
112
+ rdoc (~> 3.12)
113
+ rspec-core
114
+ shoulda
115
+ unicode_utils
data/Guardfile ADDED
@@ -0,0 +1,8 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Runs RSpec through Guard whenever a watched file changes.
guard :rspec, cli: '--color --format nested' do
  # A spec file that changed is re-run itself.
  watch(%r{^spec/.+_spec\.rb$})

  # A library file maps to the spec with the same relative path under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # Touching the shared helper re-runs the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Lunatyq, skpvox
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # word_aligner
2
+
3
+ word_aligner lets you compare strings the way the word_align.pl script from cmu-sphinx does.
4
+
5
+ It calculates the distance between a transcription and a hypothesis in terms of word insertions, deletions and substitutions.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your Gemfile:
10
+
11
+ ```
12
+ gem 'word_aligner'
13
+ ```
14
+
15
+ Then execute:
16
+
17
+ ```
18
+ $ bundle
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ ```
24
+ error_rate = WordAligner.align('example shown line', 'they shown line')
25
+ error_rate.aligned_transcription => "EXAMPLE shown line"
26
+ error_rate.aligned_hypothesis => "THEY shown line"
27
+ error_rate.words => 3
28
+ error_rate.insertions => 0
29
+ error_rate.deletions => 0
30
+ error_rate.substitutions => 1
31
+
32
+ error_rate.correct => 2
33
+ error_rate.errors => 1
34
+ error_rate.percent_correct => 66.0
35
+ error_rate.percent_error => 33.0
36
+ error_rate.percent_accuracy => 66.0
37
+ ```
38
+
39
+ ## Contributing
40
+
41
+ 1. Fork it
42
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
43
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
44
+ 4. Push to the branch (`git push origin my-new-feature`)
45
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Load the default and development gem groups; if Bundler cannot satisfy
# them, print its message and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "word_aligner"
  gem.homepage = "http://github.com/lunatyq/word_aligner"
  gem.license = "MIT"
  # Real metadata instead of the generator's "TODO:" placeholders; the wording
  # follows the project README.
  gem.summary = %Q{Compare strings the way the cmu-sphinx word_align.pl script does}
  gem.description = %Q{Aligns a transcription with a hypothesis word by word and reports the distance between them in terms of insertions, deletions and substitutions.}
  gem.email = "maciej@szukio.pl"
  gem.authors = ["Lunatyq"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the
# load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

# `rake rdoc` builds API documentation from the README and lib/ into rdoc/.
require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # The title carries the content of the VERSION file when that file exists.
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "word_aligner #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,11 @@
1
+ require 'word_aligner/aligner'
2
+ require 'word_aligner/word_error_rate'
3
+
4
# Namespace of the gem and home of its one-call convenience API.
module WordAligner
  # Aligns +transcription+ with +hypothesis+.
  #
  # Builds an Aligner for the two strings and returns whatever
  # Aligner#word_error_rate returns for them.
  def align(transcription, hypothesis)
    aligner = Aligner.new(transcription, hypothesis)
    aligner.word_error_rate
  end

  # Expose #align as WordAligner.align (and as a private instance method for
  # classes that include this module).
  module_function :align
end
@@ -0,0 +1,170 @@
1
+ require 'unicode_utils/upcase'
2
+
3
+ module WordAligner
4
# Aligns the words of a reference transcription with the words of a
# hypothesis using a word-level edit-distance table, then walks the table
# backwards to count insertions, deletions, substitutions and matches.
#
# Both inputs are upper-cased and split on whitespace before they are
# compared, so the comparison ignores case.
class Aligner
  # Backtrace pointers: the operation that produced a cell of the cost table.
  INSERTION = 1
  DELETION = 2
  MATCHING = 3
  # Must stay MATCHING + 1: #align_tokens stores MATCHING + cost (cost is 0 or 1).
  SUBSTITUTE = 4

  # Placeholder shown opposite an inserted or deleted word.
  GAP = '***'.freeze

  attr_accessor :insertions, :substitutions, :deletions, :matching, :align_cost
  attr_accessor :aligned_transcription, :aligned_hypothesis

  attr_reader :transcription, :hypothesis
  attr_reader :reference_tokens, :hypothese_tokens

  attr_reader :align_matrix, :backtrace_matrix

  # Aligns +original+ with +hypothesis+ and returns the result hash
  # described at #result.
  def self.align(original, hypothesis)
    new(original, hypothesis).result
  end

  # transcription - the reference sentence.
  # hypothesis    - the sentence to compare against the reference.
  #
  # Only tokenizes the inputs; the alignment itself is computed lazily by
  # #result / #word_error_rate.
  def initialize(transcription, hypothesis)
    @transcription = transcription
    @hypothesis = hypothesis

    @insertions = 0
    @substitutions = 0
    @deletions = 0
    @matching = 0

    @reference_tokens = tokenize(transcription)
    @hypothese_tokens = tokenize(hypothesis)
  end

  # Returns a WordErrorRate built from #result.
  def word_error_rate
    WordErrorRate.new(result)
  end

  # Returns the alignment as a Hash with the keys :aligned_transcription,
  # :aligned_hypothesis, :transcription, :hypothesis, :transcription_words,
  # :insertions, :deletions, :substitutions, :matching and :align_cost.
  #
  # The alignment runs at most once per instance (guarded by align_matrix),
  # so repeated calls do not count the operations twice.
  def result
    align_sentences if align_matrix.nil?

    {
      aligned_transcription: aligned_transcription,
      aligned_hypothesis: aligned_hypothesis,
      transcription: transcription,
      hypothesis: hypothesis,
      transcription_words: reference_tokens.size,
      insertions: insertions,
      deletions: deletions,
      substitutions: substitutions,
      matching: matching,
      align_cost: align_cost
    }
  end

  private

  # Runs the three alignment phases in order.
  def align_sentences
    initialize_matrices
    align_tokens
    generate_aligned_sentences
  end

  # Renders the alignment as two column-aligned sentences. Matching words
  # are lower-cased, all other words stay upper-case, and GAP stands in for
  # the missing side of an insertion or deletion.
  def generate_aligned_sentences
    ref_columns = []
    hyp_columns = []

    backtrace.each do |ref, hyp|
      if ref && hyp && ref == hyp
        ref = ref.downcase
        hyp = hyp.downcase
      end

      ref ||= GAP
      hyp ||= GAP

      # Each column is as wide as its longer word, and never narrower than GAP.
      width = [ref.size, hyp.size, GAP.size].max

      ref_columns << ref.ljust(width)
      hyp_columns << hyp.ljust(width)
    end

    self.aligned_transcription = ref_columns.join(' ').strip
    self.aligned_hypothesis = hyp_columns.join(' ').strip
  end

  # Seeds the cost and backtrace tables. Rows are indexed by reference
  # position, columns by hypothesis position.
  def initialize_matrices
    columns = hypothese_tokens.size

    # Row 0: reaching j hypothesis words from an empty reference takes j insertions.
    @align_matrix = [(0..columns).to_a]
    @backtrace_matrix = [Array.new(columns + 1, INSERTION)]

    # Column 0: reaching an empty hypothesis from i reference words takes i deletions.
    (0..reference_tokens.size).each do |i|
      (@align_matrix[i] ||= [])[0] = i
      (@backtrace_matrix[i] ||= [])[0] = DELETION
    end

    self.aligned_transcription = ''
    self.aligned_hypothesis = ''
  end

  # Fills both tables cell by cell and stores the total cost in align_cost.
  # On ties a match/substitution is preferred over an insertion, and an
  # insertion over a deletion.
  def align_tokens
    reference_tokens.each_with_index do |ref_word, ref_index|
      i = ref_index + 1

      hypothese_tokens.each_with_index do |hyp_word, hyp_index|
        j = hyp_index + 1
        cost = ref_word == hyp_word ? 0 : 1

        ins = align_matrix[i][j - 1] + 1
        del = align_matrix[i - 1][j] + 1
        subst = align_matrix[i - 1][j - 1] + cost
        min = [ins, del, subst].min

        align_matrix[i][j] = min

        backtrace_matrix[i][j] =
          if min == subst
            MATCHING + cost # MATCHING when the words are equal, SUBSTITUTE otherwise
          elsif min == ins
            INSERTION
          else
            DELETION
          end
      end
    end

    self.align_cost = align_matrix[reference_tokens.size][hypothese_tokens.size]
  end

  # Walks the backtrace table from the bottom-right cell to the origin,
  # counting each operation, and returns the alignment as an Array of
  # [reference_word, hypothesis_word] pairs in sentence order. The missing
  # side of an insertion or deletion is nil.
  def backtrace
    i = reference_tokens.size
    j = hypothese_tokens.size

    # Pairs are collected back-to-front and reversed once at the end.
    reversed = []

    until i.zero? && j.zero?
      case backtrace_matrix[i][j]
      when INSERTION
        reversed << [nil, hypothese_tokens[j - 1]]
        self.insertions += 1
        j -= 1
      when DELETION
        reversed << [reference_tokens[i - 1], nil]
        self.deletions += 1
        i -= 1
      when MATCHING
        reversed << [reference_tokens[i - 1], hypothese_tokens[j - 1]]
        self.matching += 1
        i -= 1
        j -= 1
      when SUBSTITUTE
        reversed << [reference_tokens[i - 1], hypothese_tokens[j - 1]]
        self.substitutions += 1
        i -= 1
        j -= 1
      else
        # Defensive: every cell visited here is filled by
        # #initialize_matrices or #align_tokens.
        break
      end
    end

    reversed.reverse
  end

  # Upper-cases +word+ and splits it on whitespace. nil is treated as an
  # empty sentence.
  def tokenize(word)
    UnicodeUtils.upcase(word.to_s.rstrip).split(' ')
  end
end
170
+ end