word_aligner 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ that should be enough and is exactly the data which we will use (enough1)
@@ -0,0 +1 @@
1
+ that should be enough and it is exactly the data which we will use (rec1)
@@ -0,0 +1 @@
1
+ that should be enough and is exactly the data which we will use (ex1)
@@ -0,0 +1 @@
1
+ that should be enough and it is exactly the data which we will use (ex1)
@@ -0,0 +1,2 @@
1
+ To see or not to see.(ToBe1)
2
+ This it there question.(Question1)
@@ -0,0 +1,2 @@
1
+ To be or not to be.(ToBe1)
2
+ This is the question.(Question1)
@@ -0,0 +1 @@
1
+ what might we use it full to do this right way the night (ThatMight1)
@@ -0,0 +1 @@
1
+ that might be usefull to do this right way we might (x1)
@@ -0,0 +1,302 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # word_align.pl - Calculate word error and accuracy for a recognition
4
+ # hypothesis file vs. a reference transcription
5
+ #
6
+ # Written by David Huggins-Daines <dhuggins@cs.cmu.edu> for Speech
7
+ # Recognition and Understanding 11-751, Carnegie Mellon University,
8
+ # October 2004.
9
+
10
+ use Data::Dumper;
11
+
12
+ use strict;
13
+ use Getopt::Long;
14
+ use Pod::Usage;
15
+ use vars qw($Verbose $IgnoreUttID);
16
+
17
+ my ($help,%hyphash);
18
+ GetOptions(
19
+ 'help|?' => \$help,
20
+ 'verbose|v' => \$Verbose,
21
+ 'ignore-uttid|i' => \$IgnoreUttID,
22
+ ) or pod2usage(1);
23
+ pod2usage(1) if $help;
24
+
25
+ pod2usage(2) unless @ARGV == 2;
26
+ my ($ref, $hyp) = @ARGV;
27
+
28
+ open HYP, "<$hyp" or die "Failed to open $hyp: $!";
29
+ while (defined(my $hyp_utt=<HYP>)){
30
+ my $hyp_uttid;
31
+ ($hyp_utt, $hyp_uttid) = s3_magic_norm($hyp_utt);
32
+ $hyphash{$hyp_uttid} = "$hyp_utt ($hyp_uttid)";
33
+ }
34
+ close HYP;
35
+
36
+ open REF, "<$ref" or die "Failed to open $ref: $!";
37
+ open HYP, "<$hyp" or die "Failed to open $hyp: $!";
38
+
39
+ use constant INS => 1;
40
+ use constant DEL => 2;
41
+ use constant MATCH => 3;
42
+ use constant SUBST => 4;
43
+ use constant BIG_NUMBER => 1e50;
44
+
45
+ my ($total_words, $total_match, $total_cost);
46
+ my ($total_ins, $total_del, $total_subst);
47
+ while (defined(my $ref_utt = <REF>)) {
48
+ my $hyp_utt;
49
+ my $ref_uttid;
50
+ my $hyp_uttid;
51
+
52
+ ($ref_utt,$ref_uttid)=s3_magic_norm($ref_utt);
53
+
54
+
55
+ if(defined $IgnoreUttID){
56
+ $hyp_utt = <HYP>;
57
+ die "UttID is ignored but file size mismatch between $ref and $hyp" unless defined($hyp_utt);
58
+ }else{
59
+ $hyp_utt=$hyphash{$ref_uttid};
60
+ die "UttID is not ignored but it could not found in any entries of the hypothesis file on line3 $. UTTID: $ref_uttid\n" unless defined($hyp_utt);
61
+ }
62
+
63
+ ($hyp_utt,$hyp_uttid)=s3_magic_norm($hyp_utt);
64
+
65
+ if(! defined $IgnoreUttID){
66
+ die "Utterance ID mismatch on line $.: $ref_uttid != $hyp_uttid"
67
+ unless $ref_uttid eq $hyp_uttid;
68
+ }
69
+
70
+ # Split the text into an array of words
71
+ my @ref_words = split ' ', $ref_utt;
72
+ my @hyp_words = split ' ', $hyp_utt;
73
+
74
+ my (@align_matrix, @backtrace_matrix);
75
+
76
+ # Initialize the alignment and backtrace matrices
77
+ initialize(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
78
+ # Do DP alignment maintaining backtrace pointers
79
+ my $cost = align(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
80
+ # Find the backtrace
81
+ my ($alignment, $ins, $del, $subst, $match) = backtrace(\@ref_words, \@hyp_words,
82
+ \@align_matrix, \@backtrace_matrix);
83
+
84
+ # Format the alignment nicely
85
+ my ($ref_align, $hyp_align) = ("", "");
86
+ foreach (@$alignment) {
87
+ my ($ref, $hyp) = @$_;
88
+ my $width = 0;
89
+
90
+ # Capitalize errors (they already are...), lowercase matches
91
+ if (defined($ref) and defined($hyp) and $ref eq $hyp) {
92
+ $ref = lc $ref;
93
+ $hyp = lc $hyp;
94
+ }
95
+
96
+ # Replace deletions with ***
97
+ foreach ($ref, $hyp) { $_ = "***" unless defined $_ };
98
+
99
+ # Find the width of this column
100
+ foreach ($ref, $hyp) { $width = length if length > $width };
101
+ $width = 3 if $width < 3; # Make it long enough for ***
102
+
103
+ # Space out the words and concatenate them to the output
104
+ $ref_align .= sprintf("%-*s ", $width, $ref);
105
+ $hyp_align .= sprintf("%-*s ", $width, $hyp);
106
+ }
107
+ print "$ref_align ($ref_uttid)\n$hyp_align ($hyp_uttid)\n";
108
+
109
+ # Print out the word error and accuracy rates
110
+ my $error = @ref_words == 0 ? 1 : $cost/@ref_words;
111
+ my $acc = @ref_words == 0 ? 0 : $match/@ref_words;
112
+ printf("Words: %d Correct: %d Errors: %d Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
113
+ scalar(@ref_words), $match, $cost, $acc*100, $error*100, 100-$error*100);
114
+ print "Insertions: $ins Deletions: $del Substitutions: $subst\n";
115
+
116
+ $total_cost += $cost;
117
+ $total_match += $match;
118
+ $total_words += @ref_words;
119
+ $total_ins += $ins;
120
+ $total_del += $del;
121
+ $total_subst += $subst;
122
+ }
123
+ # Print out the total word error and accuracy rates
124
+ my $error = $total_cost/$total_words;
125
+ my $acc = $total_match/$total_words;
126
+ printf("TOTAL Words: %d Correct: %d Errors: %d\nTOTAL Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
127
+ $total_words, $total_match, $total_cost, $acc*100, $error*100, 100-$error*100);
128
+ print "TOTAL Insertions: $total_ins Deletions: $total_del Substitutions: $total_subst\n";
129
+
130
# Normalize one line of a match (transcript) file.
#
# Argument: the raw line, with or without its line ending.
# Returns a two-element list:
#   - the normalized text: upper-cased, with fillers, context cues,
#     alternative-pronunciation markers, class tags and the utterance ID
#     removed, and "_" / "." turned into spaces;
#   - the utterance ID taken from the trailing "(ID ...)" group, or undef
#     when the line has no such group.
#
# The ID is returned upper-cased because case is normalized before it is
# extracted.  A trailing single-digit group such as "(1)" is removed as an
# alternative-pronunciation marker before the ID is looked for, so it is
# never reported as an ID.
sub s3_magic_norm{
    my ($word)=@_;

    # Treat a missing line as an empty one.
    $word = '' unless defined $word;

    # Remove line endings
    $word =~ s/\s+$//;
    # Normalize case
    $word = uc $word;
    # Remove filler words and context cues
    $word =~ s/<[^>]+>//g;
    $word =~ s/\+\+[^+]+\+\+//g;
    $word =~ s/\+[^+]+\+//g;

    # Remove alternative pronunciations
    $word =~ s/\([1-9]\)//g;

    # Remove class tags
    $word =~ s/:\S+//g;

    # Extract the utterance ID and remove it from the line.  $1 is only
    # meaningful when this substitution itself matched; after a failed match
    # it still holds whatever an earlier match (possibly in the caller)
    # captured, so it must not be returned unconditionally.
    my $uttid;
    if ($word =~ s/\(([^) ]+)[^)]*\)$//) {
        $uttid = $1;
    }

    # Split apart compound words and acronyms
    $word =~ tr/_./ /;

    return ($word, $uttid);
}
157
+
158
# Seed the first row and first column of both DP tables.
#
# Arguments: references to the reference words, the hypothesis words, the
# cost matrix and the backtrace matrix.  Both matrices are filled in place.
#
# Row 0 costs $j and points to INS; column 0 costs $i and points to DEL.
# Cell [0][0] therefore ends up with cost 0 and pointer DEL, because the
# column is written after the row.
sub initialize {
    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;

    my $hyp_count = @$hyp_words;
    my $ref_count = @$ref_words;

    # Along the j axis every step is an insertion
    foreach my $col (0 .. $hyp_count) {
        $align_matrix->[0][$col]     = $col;
        $backtrace_matrix->[0][$col] = INS;
    }

    # Along the i axis every step is a deletion
    foreach my $row (0 .. $ref_count) {
        $align_matrix->[$row][0]     = $row;
        $backtrace_matrix->[$row][0] = DEL;
    }
}
176
+
177
# Fill the DP cost matrix and record which edit produced each cell.
#
# Arguments: references to the reference words, the hypothesis words, the
# cost matrix and the backtrace matrix.  Row 0 and column 0 of the cost
# matrix must already be filled (see initialize); every other cell is
# written here.
#
# Returns the cost stored in the bottom-right cell, i.e. the total edit
# cost of aligning the two word lists.
sub align {
    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;

    my $ref_count = @$ref_words;
    my $hyp_count = @$hyp_words;

    foreach my $i (1 .. $ref_count) {
        my $ref_word = $ref_words->[$i-1];

        foreach my $j (1 .. $hyp_count) {
            my $hyp_word = $hyp_words->[$j-1];

            # Substitution cost: false (0) for identical words, 1 otherwise
            my $cost = $ref_word ne $hyp_word;

            # Cost of reaching this cell from the left, from above and
            # from the diagonal
            my $ins   = $align_matrix->[$i][$j-1]   + 1;
            my $del   = $align_matrix->[$i-1][$j]   + 1;
            my $subst = $align_matrix->[$i-1][$j-1] + $cost;
            print "Costs at $i $j: INS $ins DEL $del SUBST $subst\n" if $Verbose;

            # Keep the cheapest of the three
            my $min = BIG_NUMBER;
            foreach my $candidate ($ins, $del, $subst) {
                $min = $candidate if $candidate < $min;
            }
            $align_matrix->[$i][$j] = $min;

            # On a tie the diagonal is tested first, so a match or
            # substitution wins, then insertion, then deletion.
            if ($min == $subst) {
                my $label = $cost ? "SUBSTITUTION" : "MATCH";
                print "${label}($ref_word <=> $hyp_word)\n" if $Verbose;
                # MATCH + 1 is the SUBST code
                $backtrace_matrix->[$i][$j] = MATCH + $cost;
            }
            elsif ($min == $ins) {
                print "INSERTION (0 => $hyp_word)\n" if $Verbose;
                $backtrace_matrix->[$i][$j] = INS;
            }
            elsif ($min == $del) {
                print "DELETION ($ref_word => 0)\n" if $Verbose;
                $backtrace_matrix->[$i][$j] = DEL;
            }
        }
    }

    return $align_matrix->[$ref_count][$hyp_count];
}
222
+
223
# Walk the backtrace matrix from the bottom-right cell back to [0][0] and
# rebuild the word-level alignment.
#
# Arguments: references to the reference words, the hypothesis words, the
# cost matrix (read only for verbose tracing) and the backtrace matrix.
#
# Returns a five-element list: a reference to the alignment (a list of
# [ref_word, hyp_word] pairs in utterance order, with undef on the missing
# side of an insertion or deletion), then the number of insertions,
# deletions, substitutions and matches.
#
# The walk stops early if it meets a cell holding none of the four known
# pointer codes; the alignment collected so far is returned.
sub backtrace {
    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;

    my @alignment;
    my ($inspen, $delpen, $substpen, $match) = (0, 0, 0, 0);

    # Start in the bottom-right corner
    my ($i, $j) = (scalar @$ref_words, scalar @$hyp_words);

    until ($i == 0 and $j == 0) {
        my $pointer = $backtrace_matrix->[$i][$j];
        print "Cost at $i $j: $align_matrix->[$i][$j]\n"
            if $Verbose;

        # Pairs are pushed onto the front because the walk runs backwards
        if ($pointer == INS) {
            my $hyp_word = $hyp_words->[$j-1];
            print "INSERTION (0 => $hyp_word)" if $Verbose;
            unshift @alignment, [undef, $hyp_word];
            ++$inspen;
            --$j;
        }
        elsif ($pointer == DEL) {
            my $ref_word = $ref_words->[$i-1];
            print "DELETION ($ref_word => 0)" if $Verbose;
            unshift @alignment, [$ref_word, undef];
            ++$delpen;
            --$i;
        }
        elsif ($pointer == MATCH or $pointer == SUBST) {
            # Both are diagonal moves; they differ only in label and counter
            my $is_match = $pointer == MATCH;
            my $label    = $is_match ? "MATCH" : "SUBSTITUTION";
            my $ref_word = $ref_words->[$i-1];
            my $hyp_word = $hyp_words->[$j-1];
            print "$label ($ref_word <=> $hyp_word)" if $Verbose;
            unshift @alignment, [$ref_word, $hyp_word];
            if ($is_match) {
                ++$match;
            }
            else {
                ++$substpen;
            }
            --$j;
            --$i;
        }
        else {
            # Unknown pointer: stop here
            last;
        }

        print " - moving to $i $j\n" if $Verbose;
    }

    return (\@alignment, $inspen, $delpen, $substpen, $match);
}
279
+
280
+ __END__
281
+
282
+ =head1 NAME
283
+
284
+ calculate_wer - Calculate Word Error Rate from a reference and hypothesis file
285
+
286
+ =head1 SYNOPSIS
287
+
288
+ calculate_wer [options] reference_file hypothesis_file
289
+
290
+ =head1 OPTIONS
291
+
292
+ =over 8
293
+
294
+ =item B<--help>, B<-?>
295
+
296
+ Print a brief help message and exit.
297
+
298
+ =item B<--verbose>, B<-v>
299
+
300
+ Print out messages tracing the alignment algorithm.
301
+
302
+ =cut
@@ -0,0 +1,20 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+
8
+ require 'word_aligner'
9
+
10
+ RSpec.configure do |config|
11
+ config.treat_symbols_as_metadata_keys_with_true_values = true
12
+ config.run_all_when_everything_filtered = true
13
+ config.filter_run :focus
14
+
15
+ # Run specs in random order to surface order dependencies. If you find an
16
+ # order dependency and want to debug it, you can fix the order by providing
17
+ # the seed, which is printed after each run.
18
+ # --seed 1234
19
+ config.order = 'random'
20
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word_aligner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Maciej
8
+ - Simon
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-09-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: unicode_utils
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - '>='
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - '>='
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: shoulda
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rdoc
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ~>
47
+ - !ruby/object:Gem::Version
48
+ version: '3.12'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: '3.12'
56
+ - !ruby/object:Gem::Dependency
57
+ name: bundler
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ~>
61
+ - !ruby/object:Gem::Version
62
+ version: '1.0'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '1.0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: jeweler
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 1.8.7
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ~>
82
+ - !ruby/object:Gem::Version
83
+ version: 1.8.7
84
+ description: WordAligner allows you to compare two strings as CMU-Sphinx word-align.pl
85
+ does
86
+ email: maciej@szukio.pl
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files:
90
+ - LICENSE.txt
91
+ - README.md
92
+ files:
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - Guardfile
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - VERSION
100
+ - lib/word_aligner.rb
101
+ - lib/word_aligner/aligner.rb
102
+ - lib/word_aligner/word_error_rate.rb
103
+ - spec/lib/word_aligner/aligner_spec.rb
104
+ - spec/lib/word_aligner/word_error_rate_spec.rb
105
+ - spec/lib/word_aligner_spec.rb
106
+ - spec/sample_data/grab_for_comparision.rb
107
+ - spec/sample_data/regression/sentences.yml
108
+ - spec/sample_data/source_data/enough.hypotheses.txt
109
+ - spec/sample_data/source_data/enough.txt
110
+ - spec/sample_data/source_data/exactly_data.hypotheses.txt
111
+ - spec/sample_data/source_data/exactly_data.txt
112
+ - spec/sample_data/source_data/hamlet.hypotheses.txt
113
+ - spec/sample_data/source_data/hamlet.txt
114
+ - spec/sample_data/source_data/that_might.hypotheses.txt
115
+ - spec/sample_data/source_data/that_might.txt
116
+ - spec/sample_data/word_align.pl
117
+ - spec/spec_helper.rb
118
+ homepage: http://github.com/lunatyq/word_aligner
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - '>='
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - '>='
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.1.1
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: CMU-sphinx like word-align.pl comparison
142
+ test_files: []
143
+ has_rdoc: