word_aligner 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +115 -0
- data/Guardfile +8 -0
- data/LICENSE.txt +20 -0
- data/README.md +45 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/word_aligner.rb +11 -0
- data/lib/word_aligner/aligner.rb +170 -0
- data/lib/word_aligner/word_error_rate.rb +44 -0
- data/spec/lib/word_aligner/aligner_spec.rb +31 -0
- data/spec/lib/word_aligner/word_error_rate_spec.rb +28 -0
- data/spec/lib/word_aligner_spec.rb +15 -0
- data/spec/sample_data/grab_for_comparision.rb +51 -0
- data/spec/sample_data/regression/sentences.yml +647 -0
- data/spec/sample_data/source_data/enough.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/enough.txt +1 -0
- data/spec/sample_data/source_data/exactly_data.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/exactly_data.txt +1 -0
- data/spec/sample_data/source_data/hamlet.hypotheses.txt +2 -0
- data/spec/sample_data/source_data/hamlet.txt +2 -0
- data/spec/sample_data/source_data/that_might.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/that_might.txt +1 -0
- data/spec/sample_data/word_align.pl +302 -0
- data/spec/spec_helper.rb +20 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9a4c26333b5e0991c70d0e61ef5435268672a112
|
4
|
+
data.tar.gz: 2ec6efcb6606676f2e30bd26425bd21843278018
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 199f2c4f5288a60c907a9d842951e740ec712eeaca1a9371a74b96db92d2053107543809b107c23bd025f532f5a75add474f5ff7f12cd163d0c99cb728cb5ffd
|
7
|
+
data.tar.gz: 95ae32a917c4e2578bb6087973c0cf195eb4b41b56e771b37d1d138b336440163e8bee9e1ebf015bb48b9690831654ed08fcce8c962dcbae3f11d765f9f01d86
|
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Fetch gems over TLS rather than plain HTTP.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Runtime dependency: the library requires 'unicode_utils/upcase' when it is
# loaded, so it must live outside the :development group.
gem "unicode_utils"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "guard"
  gem "guard-rspec"
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 1.8.7"
  gem "rspec-core"
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activesupport (4.0.0)
|
5
|
+
i18n (~> 0.6, >= 0.6.4)
|
6
|
+
minitest (~> 4.2)
|
7
|
+
multi_json (~> 1.3)
|
8
|
+
thread_safe (~> 0.1)
|
9
|
+
tzinfo (~> 0.3.37)
|
10
|
+
addressable (2.3.5)
|
11
|
+
atomic (1.1.13)
|
12
|
+
builder (3.2.2)
|
13
|
+
coderay (1.0.9)
|
14
|
+
diff-lcs (1.2.4)
|
15
|
+
faraday (0.8.8)
|
16
|
+
multipart-post (~> 1.2.0)
|
17
|
+
ffi (1.9.0)
|
18
|
+
formatador (0.2.4)
|
19
|
+
git (1.2.6)
|
20
|
+
github_api (0.10.1)
|
21
|
+
addressable
|
22
|
+
faraday (~> 0.8.1)
|
23
|
+
hashie (>= 1.2)
|
24
|
+
multi_json (~> 1.4)
|
25
|
+
nokogiri (~> 1.5.2)
|
26
|
+
oauth2
|
27
|
+
guard (1.8.2)
|
28
|
+
formatador (>= 0.2.4)
|
29
|
+
listen (>= 1.0.0)
|
30
|
+
lumberjack (>= 1.0.2)
|
31
|
+
pry (>= 0.9.10)
|
32
|
+
thor (>= 0.14.6)
|
33
|
+
guard-rspec (3.0.3)
|
34
|
+
guard (>= 1.8)
|
35
|
+
rspec (~> 2.13)
|
36
|
+
hashie (2.0.5)
|
37
|
+
highline (1.6.19)
|
38
|
+
httpauth (0.2.0)
|
39
|
+
i18n (0.6.5)
|
40
|
+
jeweler (1.8.7)
|
41
|
+
builder
|
42
|
+
bundler (~> 1.0)
|
43
|
+
git (>= 1.2.5)
|
44
|
+
github_api (= 0.10.1)
|
45
|
+
highline (>= 1.6.15)
|
46
|
+
nokogiri (= 1.5.10)
|
47
|
+
rake
|
48
|
+
rdoc
|
49
|
+
json (1.8.0)
|
50
|
+
jwt (0.1.8)
|
51
|
+
multi_json (>= 1.5)
|
52
|
+
listen (1.3.1)
|
53
|
+
rb-fsevent (>= 0.9.3)
|
54
|
+
rb-inotify (>= 0.9)
|
55
|
+
rb-kqueue (>= 0.2)
|
56
|
+
lumberjack (1.0.4)
|
57
|
+
method_source (0.8.2)
|
58
|
+
minitest (4.7.5)
|
59
|
+
multi_json (1.8.0)
|
60
|
+
multi_xml (0.5.5)
|
61
|
+
multipart-post (1.2.0)
|
62
|
+
nokogiri (1.5.10)
|
63
|
+
oauth2 (0.9.2)
|
64
|
+
faraday (~> 0.8)
|
65
|
+
httpauth (~> 0.2)
|
66
|
+
jwt (~> 0.1.4)
|
67
|
+
multi_json (~> 1.0)
|
68
|
+
multi_xml (~> 0.5)
|
69
|
+
rack (~> 1.2)
|
70
|
+
pry (0.9.12.2)
|
71
|
+
coderay (~> 1.0.5)
|
72
|
+
method_source (~> 0.8)
|
73
|
+
slop (~> 3.4)
|
74
|
+
rack (1.5.2)
|
75
|
+
rake (10.1.0)
|
76
|
+
rb-fsevent (0.9.3)
|
77
|
+
rb-inotify (0.9.1)
|
78
|
+
ffi (>= 0.5.0)
|
79
|
+
rb-kqueue (0.2.0)
|
80
|
+
ffi (>= 0.5.0)
|
81
|
+
rdoc (3.12.2)
|
82
|
+
json (~> 1.4)
|
83
|
+
rspec (2.14.1)
|
84
|
+
rspec-core (~> 2.14.0)
|
85
|
+
rspec-expectations (~> 2.14.0)
|
86
|
+
rspec-mocks (~> 2.14.0)
|
87
|
+
rspec-core (2.14.5)
|
88
|
+
rspec-expectations (2.14.2)
|
89
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
90
|
+
rspec-mocks (2.14.3)
|
91
|
+
shoulda (3.5.0)
|
92
|
+
shoulda-context (~> 1.0, >= 1.0.1)
|
93
|
+
shoulda-matchers (>= 1.4.1, < 3.0)
|
94
|
+
shoulda-context (1.1.5)
|
95
|
+
shoulda-matchers (2.3.0)
|
96
|
+
activesupport (>= 3.0.0)
|
97
|
+
slop (3.4.6)
|
98
|
+
thor (0.18.1)
|
99
|
+
thread_safe (0.1.2)
|
100
|
+
atomic
|
101
|
+
tzinfo (0.3.37)
|
102
|
+
unicode_utils (1.4.0)
|
103
|
+
|
104
|
+
PLATFORMS
|
105
|
+
ruby
|
106
|
+
|
107
|
+
DEPENDENCIES
|
108
|
+
bundler (~> 1.0)
|
109
|
+
guard
|
110
|
+
guard-rspec
|
111
|
+
jeweler (~> 1.8.7)
|
112
|
+
rdoc (~> 3.12)
|
113
|
+
rspec-core
|
114
|
+
shoulda
|
115
|
+
unicode_utils
|
data/Guardfile
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
# Re-run specs with guard-rspec whenever a watched file changes.
guard(:rspec, :cli => '--color --format nested') do
  # A changed spec file is returned as-is (no block given).
  watch(%r{^spec/.+_spec\.rb$})

  # A changed lib file maps to the spec that mirrors its path under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # A changed spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Lunatyq, skpvox
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# word_aligner
|
2
|
+
|
3
|
+
word_aligner lets you compare strings the same way the word_align.pl script from cmu-sphinx does.
|
4
|
+
|
5
|
+
It calculates distance in terms of insertions, deletions and substitutions.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your Gemfile:
|
10
|
+
|
11
|
+
```
|
12
|
+
gem 'word_aligner'
|
13
|
+
```
|
14
|
+
|
15
|
+
Then execute:
|
16
|
+
|
17
|
+
```
|
18
|
+
$ bundle
|
19
|
+
```
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
```
|
24
|
+
error_rate = WordAligner.align('example shown line', 'they shown line')
|
25
|
+
error_rate.aligned_transcription => "EXAMPLE shown line"
|
26
|
+
error_rate.aligned_hypothesis => "THEY shown line"
|
27
|
+
error_rate.words => 3
|
28
|
+
error_rate.insertions => 0
|
29
|
+
error_rate.deletions => 0
|
30
|
+
error_rate.substitutions => 1
|
31
|
+
|
32
|
+
error_rate.correct => 2
|
33
|
+
error_rate.errors => 1
|
34
|
+
error_rate.percent_correct => 66.0
|
35
|
+
error_rate.percent_error => 33.0
|
36
|
+
error_rate.percent_accuracy => 66.0
|
37
|
+
```
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
1. Fork it
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Load the bundle up front so a missing gem produces a clear message and the
# exit status Bundler reports, instead of a LoadError later on.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "word_aligner"
  gem.homepage = "http://github.com/lunatyq/word_aligner"
  gem.license = "MIT"
  # Real text instead of the generator's "TODO" placeholders.
  gem.summary = %Q{Word-level alignment of a transcription against a hypothesis}
  gem.description = %Q{Compares strings the same way the word_align.pl script from cmu-sphinx does, reporting the distance in terms of insertions, deletions and substitutions.}
  gem.email = "maciej@szukio.pl"
  gem.authors = ["Lunatyq"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# The test suite is RSpec under spec/, so the default task has to run specs;
# a Test::Unit task looking for test/**/test_*.rb would find nothing.
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
end

# Keep `rake test` working as an alias for the spec run.
task :test => :spec

task :default => :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "word_aligner #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/word_aligner/aligner.rb
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
require 'unicode_utils/upcase'
|
2
|
+
|
3
|
+
module WordAligner
  # Aligns a reference transcription against a hypothesis, word by word,
  # using a unit-cost edit distance (insertion, deletion and substitution
  # each cost 1, a match costs 0).
  #
  # Tokens are compared after upcasing with UnicodeUtils. In the aligned
  # output, matching words are shown in lower case, differing words stay in
  # upper case, and a missing counterpart is shown as '***'.
  class Aligner

    # Backtrace pointers stored in backtrace_matrix.
    INSERTION = 1
    DELETION = 2
    MATCHING = 3
    # Must stay MATCHING + 1: align_tokens stores MATCHING + cost.
    SUBSTITUTE = 4

    attr_accessor :insertions, :substitutions, :deletions, :matching, :align_cost
    attr_accessor :aligned_transcription, :aligned_hypothesis

    attr_reader :transcription, :hypothesis
    attr_reader :reference_tokens, :hypothese_tokens

    attr_reader :align_matrix, :backtrace_matrix

    # @param transcription [String] the reference text
    # @param hypothesis [String] the text to compare against the reference
    def initialize(transcription, hypothesis)
      @transcription = transcription
      @hypothesis = hypothesis

      @insertions = 0
      @substitutions = 0
      @deletions = 0
      @matching = 0

      @reference_tokens = tokenize(transcription)
      @hypothese_tokens = tokenize(hypothesis)
    end

    # Runs the alignment (once) and wraps the outcome in a WordErrorRate.
    #
    # @return [WordErrorRate]
    def word_error_rate
      WordErrorRate.new(aligned_result)
    end

    # Aligns two strings and returns the raw result hash.
    #
    # The result is built by a private instance method, so it is reached via
    # `send`; calling it with an explicit receiver would raise NoMethodError.
    # Going through aligned_result also guarantees that the alignment has
    # actually been computed before the hash is built.
    #
    # @param original [String] the reference text
    # @param hypothesis [String] the text to compare against the reference
    # @return [Hash] see #result for the keys
    def self.align(original, hypothesis)
      new(original, hypothesis).send(:aligned_result)
    end

    private

    # Computes the alignment if it has not been computed yet, then returns
    # the result hash. The nil check keeps the counters from being
    # incremented twice by a second backtrace.
    def aligned_result
      align_sentences if align_matrix.nil?

      result
    end

    # Snapshot of the current alignment state as a hash.
    def result
      {
        aligned_transcription: aligned_transcription,
        aligned_hypothesis: aligned_hypothesis,
        transcription: transcription,
        hypothesis: hypothesis,
        transcription_words: reference_tokens.size,
        insertions: insertions,
        deletions: deletions,
        substitutions: substitutions,
        matching: matching,
        align_cost: align_cost
      }
    end

    # Full pipeline: set up the matrices, fill them, then render the
    # aligned strings (which also updates the counters via backtrace).
    def align_sentences
      initialize_matrices

      align_tokens

      generate_aligned_sentences
    end

    # Renders the backtraced alignment into two column-aligned strings.
    def generate_aligned_sentences
      backtrace.each do |ref, hyp|
        # Matching words are shown in lower case; mismatches stay upper case.
        if ref && hyp && ref == hyp
          ref = ref.downcase
          hyp = hyp.downcase
        end

        # A missing counterpart (insertion/deletion) is shown as '***'.
        ref ||= '***'
        hyp ||= '***'

        # Both columns share one width, never narrower than the placeholder.
        width = [ref.size, hyp.size, 3].max

        aligned_transcription << '%-*s ' % [width, ref]
        aligned_hypothesis << '%-*s ' % [width, hyp]
      end

      aligned_transcription.strip!
      aligned_hypothesis.strip!
    end

    # Prepares the cost and backtrace matrices with their first row and
    # column, and resets the aligned output strings.
    def initialize_matrices
      @align_matrix = []
      @backtrace_matrix = []

      # Row 0: reaching hypothesis word j from nothing takes j insertions.
      align_matrix << (0..hypothese_tokens.size).to_a

      backtrace_matrix << Array.new(hypothese_tokens.size + 1, INSERTION)

      # Column 0: reaching nothing from reference word i takes i deletions.
      (0..reference_tokens.size).each { |j| align_matrix[j] ||= []; align_matrix[j][0] = j }
      (0..reference_tokens.size).each { |i| backtrace_matrix[i] ||= []; backtrace_matrix[i][0] = DELETION }

      self.aligned_transcription = ''
      self.aligned_hypothesis = ''
    end

    # Fills the cost and backtrace matrices and records the total cost.
    # On ties, substitution/match is preferred, then insertion, then deletion.
    def align_tokens
      (1..reference_tokens.size).each do |i|
        (1..hypothese_tokens.size).each do |j|
          cost = reference_tokens[i - 1] != hypothese_tokens[j - 1] ? 1 : 0

          ins = align_matrix[i][j - 1] + 1
          del = align_matrix[i - 1][j] + 1
          subst = align_matrix[i - 1][j - 1] + cost
          min = [ins, del, subst].min

          align_matrix[i][j] = min

          if min == subst
            # MATCHING when cost is 0, SUBSTITUTE when cost is 1.
            backtrace_matrix[i][j] = MATCHING + cost
          elsif min == ins
            backtrace_matrix[i][j] = INSERTION
          elsif min == del
            backtrace_matrix[i][j] = DELETION
          end
        end
      end

      self.align_cost = align_matrix[reference_tokens.size][hypothese_tokens.size]
    end

    # Walks the backtrace matrix from the bottom-right corner to the origin,
    # counting each operation along the way.
    #
    # @return [Array<Array>] [reference_word, hypothesis_word] pairs in
    #   sentence order; nil marks the missing side of an insertion/deletion
    def backtrace
      i = reference_tokens.size
      j = hypothese_tokens.size

      alignment = []

      until i == 0 && j == 0
        pointer = backtrace_matrix[i][j]

        case pointer
        when INSERTION
          alignment.unshift [nil, hypothese_tokens[j - 1]]
          self.insertions += 1
          j -= 1
        when DELETION
          alignment.unshift [reference_tokens[i - 1], nil]
          self.deletions += 1
          i -= 1
        when MATCHING
          alignment.unshift [reference_tokens[i - 1], hypothese_tokens[j - 1]]
          self.matching += 1
          j -= 1
          i -= 1
        when SUBSTITUTE
          alignment.unshift [reference_tokens[i - 1], hypothese_tokens[j - 1]]
          self.substitutions += 1
          j -= 1
          i -= 1
        else
          # Unknown pointer: stop rather than loop forever.
          break
        end
      end

      alignment
    end

    # Upcases the text and splits it into words on spaces.
    def tokenize(word)
      UnicodeUtils.upcase(word.rstrip).split(' ')
    end

  end
end
|