word_aligner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ that should be enough and is exactly the data which we will use (enough1)
@@ -0,0 +1 @@
1
+ that should be enough and it is exactly the data which we will use (rec1)
@@ -0,0 +1 @@
1
+ that should be enough and is exactly the data which we will use (ex1)
@@ -0,0 +1 @@
1
+ that should be enough and it is exactly the data which we will use (ex1)
@@ -0,0 +1,2 @@
1
+ To see or not to see.(ToBe1)
2
+ This it there question.(Question1)
@@ -0,0 +1,2 @@
1
+ To be or not to be.(ToBe1)
2
+ This is the question.(Question1)
@@ -0,0 +1 @@
1
+ what might we use it full to do this right way the night (ThatMight1)
@@ -0,0 +1 @@
1
+ that might be usefull to do this right way we might (x1)
@@ -0,0 +1,302 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # word_align.pl - Calculate word error and accuracy for a recognition
4
+ # hypothesis file vs. a reference transcription
5
+ #
6
+ # Written by David Huggins-Daines <dhuggins@cs.cmu.edu> for Speech
7
+ # Recognition and Understanding 11-751, Carnegie Mellon University,
8
+ # October 2004.
9
+
10
+ use Data::Dumper;
11
+
12
+ use strict;
13
+ use Getopt::Long;
14
+ use Pod::Usage;
15
+ use vars qw($Verbose $IgnoreUttID);
16
+
17
+ my ($help,%hyphash); # %hyphash maps an utterance ID to its hypothesis line
18
+ GetOptions(
19
+ 'help|?' => \$help,
20
+ 'verbose|v' => \$Verbose,
21
+ 'ignore-uttid|i' => \$IgnoreUttID, # pair lines by position instead of by utterance ID
22
+ ) or pod2usage(1);
23
+ pod2usage(1) if $help;
24
+ # Exactly two positional arguments are required: the reference file, then the hypothesis file.
25
+ pod2usage(2) unless @ARGV == 2;
26
+ my ($ref, $hyp) = @ARGV;
27
+ # First pass: index every hypothesis line by its utterance ID so reference lines can be looked up by ID.
28
+ open HYP, '<', $hyp or die "Failed to open $hyp: $!";
29
+ while (defined(my $hyp_utt=<HYP>)){
30
+ my $hyp_uttid;
31
+ ($hyp_utt, $hyp_uttid) = s3_magic_norm($hyp_utt);
32
+ $hyphash{$hyp_uttid} = "$hyp_utt ($hyp_uttid)"; # keep the ID attached: the main loop normalizes this entry again
33
+ }
34
+ close HYP;
35
+ # Reopen both inputs for the alignment pass; three-argument open keeps the file name from being parsed as part of the mode.
36
+ open REF, '<', $ref or die "Failed to open $ref: $!";
37
+ open HYP, '<', $hyp or die "Failed to open $hyp: $!";
38
+
39
+ use constant INS => 1;
40
+ use constant DEL => 2;
41
+ use constant MATCH => 3;
42
+ use constant SUBST => 4;
43
+ use constant BIG_NUMBER => 1e50;
44
+
45
+ my ($total_words, $total_match, $total_cost);
46
+ my ($total_ins, $total_del, $total_subst);
47
+ while (defined(my $ref_utt = <REF>)) {
48
+ my $hyp_utt;
49
+ my $ref_uttid;
50
+ my $hyp_uttid;
51
+ # Each pass aligns one reference utterance with its hypothesis, prints the result and adds it to the totals.
52
+ ($ref_utt,$ref_uttid)=s3_magic_norm($ref_utt);
53
+
54
+ # Pick the hypothesis: the next line of HYP with --ignore-uttid, otherwise the entry indexed under the same utterance ID.
55
+ if(defined $IgnoreUttID){
56
+ $hyp_utt = <HYP>;
57
+ die "UttID is ignored but file size mismatch between $ref and $hyp" unless defined($hyp_utt);
58
+ }else{
59
+ $hyp_utt=$hyphash{$ref_uttid};
60
+ die "UttID is not ignored but it could not be found in any entry of the hypothesis file on line $. UTTID: $ref_uttid\n" unless defined($hyp_utt);
61
+ }
62
+
63
+ ($hyp_utt,$hyp_uttid)=s3_magic_norm($hyp_utt);
64
+
65
+ if(! defined $IgnoreUttID){
66
+ die "Utterance ID mismatch on line $.: $ref_uttid != $hyp_uttid"
67
+ unless $ref_uttid eq $hyp_uttid;
68
+ }
69
+
70
+ # Split the text into an array of words
71
+ my @ref_words = split ' ', $ref_utt;
72
+ my @hyp_words = split ' ', $hyp_utt;
73
+
74
+ my (@align_matrix, @backtrace_matrix);
75
+
76
+ # Initialize the alignment and backtrace matrices
77
+ initialize(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
78
+ # Do DP alignment maintaining backtrace pointers
79
+ my $cost = align(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
80
+ # Find the backtrace
81
+ my ($alignment, $ins, $del, $subst, $match) = backtrace(\@ref_words, \@hyp_words,
82
+ \@align_matrix, \@backtrace_matrix);
83
+
84
+ # Format the alignment nicely
85
+ my ($ref_align, $hyp_align) = ("", "");
86
+ foreach (@$alignment) {
87
+ my ($ref, $hyp) = @$_;
88
+ my $width = 0;
89
+
90
+ # Capitalize errors (they already are...), lowercase matches
91
+ if (defined($ref) and defined($hyp) and $ref eq $hyp) {
92
+ $ref = lc $ref;
93
+ $hyp = lc $hyp;
94
+ }
95
+
96
+ # Replace the missing side of an insertion or deletion with ***
97
+ foreach ($ref, $hyp) { $_ = "***" unless defined $_ };
98
+
99
+ # Find the width of this column
100
+ foreach ($ref, $hyp) { $width = length if length > $width };
101
+ $width = 3 if $width < 3; # Make it long enough for ***
102
+
103
+ # Space out the words and concatenate them to the output
104
+ $ref_align .= sprintf("%-*s ", $width, $ref);
105
+ $hyp_align .= sprintf("%-*s ", $width, $hyp);
106
+ }
107
+ print "$ref_align ($ref_uttid)\n$hyp_align ($hyp_uttid)\n";
108
+
109
+ # Print out the word error and accuracy rates (an empty reference counts as 100% error)
110
+ my $error = @ref_words == 0 ? 1 : $cost/@ref_words;
111
+ my $acc = @ref_words == 0 ? 0 : $match/@ref_words;
112
+ printf("Words: %d Correct: %d Errors: %d Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
113
+ scalar(@ref_words), $match, $cost, $acc*100, $error*100, 100-$error*100);
114
+ print "Insertions: $ins Deletions: $del Substitutions: $subst\n";
115
+
116
+ $total_cost += $cost;
117
+ $total_match += $match;
118
+ $total_words += @ref_words;
119
+ $total_ins += $ins;
120
+ $total_del += $del;
121
+ $total_subst += $subst;
122
+ }
123
+ # Print out the total word error and accuracy rates; with no reference words, score like an empty utterance (100% error) instead of dividing by zero
124
+ my $error = $total_words ? $total_cost/$total_words : 1;
125
+ my $acc = $total_words ? $total_match/$total_words : 0;
126
+ printf("TOTAL Words: %d Correct: %d Errors: %d\nTOTAL Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
127
+ $total_words || 0, $total_match || 0, $total_cost || 0, $acc*100, $error*100, 100-$error*100);
128
+ printf("TOTAL Insertions: %d Deletions: %d Substitutions: %d\n", $total_ins || 0, $total_del || 0, $total_subst || 0);
129
+
130
+ # Normalize one line of a transcript/match file; returns (text, uttid), where uttid is undef when the line has no trailing "(id)".
131
+ sub s3_magic_norm{
132
+ my ($word)=@_;
133
+
134
+ # Remove line endings
135
+ $word =~ s/\s+$//;
136
+ # Normalize case
137
+ $word = uc $word;
138
+ # Remove filler words and context cues
139
+ $word =~ s/<[^>]+>//g;
140
+ $word =~ s/\+\+[^+]+\+\+//g;
141
+ $word =~ s/\+[^+]+\+//g;
142
+
143
+ # Remove alternative pronunciations
144
+ $word =~ s/\([1-9]\)//g;
145
+
146
+ # Remove class tags
147
+ $word =~ s/:\S+//g;
148
+
149
+ # Strip the trailing "(uttid ...)" and keep its first token; $1 is only trusted when the substitution actually matched.
150
+ my $uttid = ($word =~ s/\(([^) ]+)[^)]*\)$//) ? $1 : undef;
151
+
152
+ # Split apart compound words and acronyms
153
+ $word =~ tr/_./ /;
154
+
155
+ return ($word,$uttid);
156
+ }
157
+
158
+ sub initialize {
159
+ my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;
160
+ # Seed row 0 and column 0 of both matrices before the DP fill.
161
+ # Row 0: reaching hypothesis position j with no reference words costs j insertions
162
+ foreach my $col (0 .. scalar @$hyp_words) {
163
+ $align_matrix->[0][$col] = $col;
164
+ }
165
+ foreach my $col (0 .. scalar @$hyp_words) {
166
+ $backtrace_matrix->[0][$col] = INS;
167
+ }
168
+ # Column 0: reaching reference position i with no hypothesis words costs i deletions
169
+ foreach my $row (0 .. scalar @$ref_words) {
170
+ $align_matrix->[$row][0] = $row;
171
+ }
172
+ foreach my $row (0 .. scalar @$ref_words) {
173
+ $backtrace_matrix->[$row][0] = DEL;
174
+ }
175
+ }
176
+
177
+ sub align {
178
+ my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;
179
+
180
+ # Fill every interior cell of the cost matrix row by row, recording in the
181
+ # backtrace matrix which edit produced each minimum. Returns the total cost.
182
+ foreach my $i (1 .. scalar @$ref_words) {
183
+ foreach my $j (1 .. scalar @$hyp_words) {
184
+ # Cost of the diagonal step: 1 for a substitution,
185
+ # 0 when the two words are equal
186
+ my $cost = ($ref_words->[$i-1] ne $hyp_words->[$j-1]) ? 1 : 0;
187
+
188
+ # Accumulated cost of arriving at this cell by each of the three moves
189
+ my $ins = $align_matrix->[$i][$j-1] + 1;
190
+ my $del = $align_matrix->[$i-1][$j] + 1;
191
+ my $subst = $align_matrix->[$i-1][$j-1] + $cost;
192
+ # Trace the candidates when verbose output was requested
193
+ print "Costs at $i $j: INS $ins DEL $del SUBST $subst\n" if $Verbose;
194
+
195
+ # The cell keeps the smallest of the three candidates
196
+ my $min = $ins;
197
+ $min = $del if $del < $min;
198
+ $min = $subst if $subst < $min;
199
+ $align_matrix->[$i][$j] = $min;
200
+
201
+ # Record the winning move for the backtrace. The checks run diagonal
202
+ # first, then insertion, then deletion, so ties keep the path diagonal.
203
+ if ($subst == $min) {
204
+ print(($cost ? "SUBSTITUTION" : "MATCH"),
205
+ "($$ref_words[$i-1] <=> $$hyp_words[$j-1])\n") if $Verbose;
206
+ $backtrace_matrix->[$i][$j] = MATCH+$cost;
207
+ }
208
+ elsif ($ins == $min) {
209
+ print "INSERTION (0 => $$hyp_words[$j-1])\n" if $Verbose;
210
+ $backtrace_matrix->[$i][$j] = INS;
211
+ }
212
+ elsif ($del == $min) {
213
+ print "DELETION ($$ref_words[$i-1] => 0)\n" if $Verbose;
214
+ $backtrace_matrix->[$i][$j] = DEL;
215
+ }
216
+ }
217
+ }
218
+
219
+ # The bottom-right cell holds the total edit cost for the whole utterance
220
+ return $align_matrix->[scalar @$ref_words][scalar @$hyp_words];
221
+ }
222
+
223
+ sub backtrace {
224
+ my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;
225
+
226
+ # Walk the stored pointers from the bottom-right cell back to the origin,
227
+ # rebuilding the aligned word pairs and counting each kind of edit.
228
+ my ($i, $j) = (scalar @$ref_words, scalar @$hyp_words);
229
+ my ($n_ins, $n_del, $n_subst, $n_match) = (0, 0, 0, 0);
230
+ my @pairs;
231
+ #print Dumper($backtrace_matrix);
232
+
233
+ # Each pair is prepended, so the finished list reads left to right
234
+ until ($i == 0 and $j == 0) {
235
+ my $move = $backtrace_matrix->[$i][$j];
236
+ print "Cost at $i $j: $$align_matrix[$i][$j]\n"
237
+ if $Verbose;
238
+ if ($move == INS) {
239
+ print "INSERTION (0 => $$hyp_words[$j-1])" if $Verbose;
240
+ # Hypothesis word with no reference counterpart: step left
241
+ unshift @pairs, [undef, $hyp_words->[$j-1]];
242
+ $n_ins++;
243
+ $j--;
244
+ print " - moving to $i $j\n" if $Verbose;
245
+ }
246
+ elsif ($move == DEL) {
247
+ print "DELETION ($$ref_words[$i-1] => 0)" if $Verbose;
248
+ # Reference word with no hypothesis counterpart: step up
249
+ unshift @pairs, [$ref_words->[$i-1], undef];
250
+ $n_del++;
251
+ $i--;
252
+ print " - moving to $i $j\n" if $Verbose;
253
+ }
254
+ elsif ($move == MATCH) {
255
+ print "MATCH ($$ref_words[$i-1] <=> $$hyp_words[$j-1])" if $Verbose;
256
+ # Identical words on both sides: step diagonally
257
+ unshift @pairs, [$ref_words->[$i-1], $hyp_words->[$j-1]];
258
+ $n_match++;
259
+ $i--;
260
+ $j--;
261
+ print " - moving to $i $j\n" if $Verbose;
262
+ }
263
+ elsif ($move == SUBST) {
264
+ print "SUBSTITUTION ($$ref_words[$i-1] <=> $$hyp_words[$j-1])" if $Verbose;
265
+ # Different words on both sides: step diagonally
266
+ unshift @pairs, [$ref_words->[$i-1], $hyp_words->[$j-1]];
267
+ $n_subst++;
268
+ $i--;
269
+ $j--;
270
+ print " - moving to $i $j\n" if $Verbose;
271
+ }
272
+ else {
273
+ last; # unrecognized pointer: stop instead of looping forever
274
+ }
275
+ }
276
+
277
+ return (\@pairs, $n_ins, $n_del, $n_subst, $n_match);
278
+ }
279
+
280
+ __END__
281
+
282
+ =head1 NAME
283
+
284
+ calculate_wer - Calculate Word Error Rate from a reference and hypothesis file
285
+
286
+ =head1 SYNOPSIS
287
+
288
+ calculate_wer [options] reference_file hypothesis_file
289
+
290
+ =head1 OPTIONS
291
+
292
+ =over 8
293
+
294
+ =item B<--help>, B<-?>
295
+
296
+ Print a brief help message and exit.
297
+
298
+ =item B<--verbose>, B<-v>
299
+
300
+ Print out messages tracing the alignment algorithm.
301
+
302
+ =cut
@@ -0,0 +1,20 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+
8
+ require 'word_aligner'
9
+
10
+ RSpec.configure do |config|
11
+ config.treat_symbols_as_metadata_keys_with_true_values = true
12
+ config.run_all_when_everything_filtered = true
13
+ config.filter_run :focus
14
+
15
+ # Run specs in random order to surface order dependencies. If you find an
16
+ # order dependency and want to debug it, you can fix the order by providing
17
+ # the seed, which is printed after each run.
18
+ # --seed 1234
19
+ config.order = 'random'
20
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word_aligner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Maciej
8
+ - Simon
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-09-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: unicode_utils
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - '>='
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - '>='
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: shoulda
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rdoc
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ~>
47
+ - !ruby/object:Gem::Version
48
+ version: '3.12'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: '3.12'
56
+ - !ruby/object:Gem::Dependency
57
+ name: bundler
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ~>
61
+ - !ruby/object:Gem::Version
62
+ version: '1.0'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '1.0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: jeweler
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 1.8.7
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ~>
82
+ - !ruby/object:Gem::Version
83
+ version: 1.8.7
84
+ description: WordAligner allows you to compare two strings as CMU-Sphinx word-align.pl
85
+ does
86
+ email: maciej@szukio.pl
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files:
90
+ - LICENSE.txt
91
+ - README.md
92
+ files:
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - Guardfile
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - VERSION
100
+ - lib/word_aligner.rb
101
+ - lib/word_aligner/aligner.rb
102
+ - lib/word_aligner/word_error_rate.rb
103
+ - spec/lib/word_aligner/aligner_spec.rb
104
+ - spec/lib/word_aligner/word_error_rate_spec.rb
105
+ - spec/lib/word_aligner_spec.rb
106
+ - spec/sample_data/grab_for_comparision.rb
107
+ - spec/sample_data/regression/sentences.yml
108
+ - spec/sample_data/source_data/enough.hypotheses.txt
109
+ - spec/sample_data/source_data/enough.txt
110
+ - spec/sample_data/source_data/exactly_data.hypotheses.txt
111
+ - spec/sample_data/source_data/exactly_data.txt
112
+ - spec/sample_data/source_data/hamlet.hypotheses.txt
113
+ - spec/sample_data/source_data/hamlet.txt
114
+ - spec/sample_data/source_data/that_might.hypotheses.txt
115
+ - spec/sample_data/source_data/that_might.txt
116
+ - spec/sample_data/word_align.pl
117
+ - spec/spec_helper.rb
118
+ homepage: http://github.com/lunatyq/word_aligner
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - '>='
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - '>='
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.1.1
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: CMU-sphinx like word-align.pl comparison
142
+ test_files: []
143
+ has_rdoc: