RubyGems - word_aligner - Versions diffs - 0.1.0 - Mend

word_aligner 0.1.0

Files changed (27) hide show

checksums.yaml +7 -0
data/Gemfile +18 -0
data/Gemfile.lock +115 -0
data/Guardfile +8 -0
data/LICENSE.txt +20 -0
data/README.md +45 -0
data/Rakefile +45 -0
data/VERSION +1 -0
data/lib/word_aligner.rb +11 -0
data/lib/word_aligner/aligner.rb +170 -0
data/lib/word_aligner/word_error_rate.rb +44 -0
data/spec/lib/word_aligner/aligner_spec.rb +31 -0
data/spec/lib/word_aligner/word_error_rate_spec.rb +28 -0
data/spec/lib/word_aligner_spec.rb +15 -0
data/spec/sample_data/grab_for_comparision.rb +51 -0
data/spec/sample_data/regression/sentences.yml +647 -0
data/spec/sample_data/source_data/enough.hypotheses.txt +1 -0
data/spec/sample_data/source_data/enough.txt +1 -0
data/spec/sample_data/source_data/exactly_data.hypotheses.txt +1 -0
data/spec/sample_data/source_data/exactly_data.txt +1 -0
data/spec/sample_data/source_data/hamlet.hypotheses.txt +2 -0
data/spec/sample_data/source_data/hamlet.txt +2 -0
data/spec/sample_data/source_data/that_might.hypotheses.txt +1 -0
data/spec/sample_data/source_data/that_might.txt +1 -0
data/spec/sample_data/word_align.pl +302 -0
data/spec/spec_helper.rb +20 -0
metadata +143 -0

data/spec/sample_data/source_data/enough.hypotheses.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ that should be enough and is exactly the data which we will use (enough1)

data/spec/sample_data/source_data/enough.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ that should be enough and it is exactly the data which we will use (rec1)

data/spec/sample_data/source_data/exactly_data.hypotheses.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ that should be enough and is exactly the data which we will use (ex1)

data/spec/sample_data/source_data/exactly_data.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ that should be enough and it is exactly the data which we will use (ex1)

data/spec/sample_data/source_data/hamlet.hypotheses.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ To see or not to see.(ToBe1)
2	+ This it there question.(Question1)

data/spec/sample_data/source_data/hamlet.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ To be or not to be.(ToBe1)
2	+ This is the question.(Question1)

data/spec/sample_data/source_data/that_might.hypotheses.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ what might we use it full to do this right way the night (ThatMight1)

data/spec/sample_data/source_data/that_might.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ that might be usefull to do this right way we might (x1)

data/spec/sample_data/word_align.pl ADDED Viewed

@@ -0,0 +1,302 @@
+#!/usr/bin/perl -w
+# word_align.pl - Calculate word error and accuracy for a recognition
+# hypothesis file vs. a reference transcription
+#
+# Written by David Huggins-Daines <dhuggins@cs.cmu.edu> for Speech
+# Recognition and Understanding 11-751, Carnegie Mellon University,
+# October 2004.
+use Data::Dumper;
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+use vars qw($Verbose $IgnoreUttID);
+# Parse the command line, then (unless utterance IDs are ignored) pre-load
+# every hypothesis line into %hyphash keyed by its normalized utterance ID so
+# the main loop can look hypotheses up by ID.  Values are stored as
+# "text (uttid)" because the main loop passes them through s3_magic_norm()
+# a second time.
+my ($help,%hyphash);
+GetOptions(
+    'help|?'         => \$help,
+    'verbose|v'      => \$Verbose,
+    'ignore-uttid|i' => \$IgnoreUttID,
+) or pod2usage(1);
+pod2usage(1) if $help;
+pod2usage(2) unless @ARGV == 2;
+my ($ref, $hyp) = @ARGV;
+# Three-argument open keeps the mode separate from the file name, so a name
+# with leading/trailing whitespace or mode-like characters is opened
+# literally instead of being re-interpreted by two-argument open.
+open HYP, '<', $hyp or die "Failed to open $hyp: $!";
+# With --ignore-uttid the main loop never consults %hyphash, and hypothesis
+# lines typically carry no ID, so skip the pre-load in that mode.
+unless (defined $IgnoreUttID) {
+    while (defined(my $hyp_utt=<HYP>)){
+        my $hyp_uttid;
+        ($hyp_utt, $hyp_uttid) = s3_magic_norm($hyp_utt);
+        $hyphash{$hyp_uttid} = "$hyp_utt ($hyp_uttid)";
+    }
+}
+close HYP;
+# REF drives the main loop; HYP is re-opened from the start because the
+# --ignore-uttid mode reads it line by line in step with REF.
+open REF, '<', $ref or die "Failed to open $ref: $!";
+open HYP, '<', $hyp or die "Failed to open $hyp: $!";
+# Backtrace pointer codes stored in the backtrace matrix.  align() stores
+# MATCH+$cost for the diagonal move, so SUBST must stay equal to MATCH + 1.
+use constant INS => 1;
+use constant DEL => 2;
+use constant MATCH => 3;
+use constant SUBST => 4;
+# Sentinel intended to be larger than any alignment cost.
+use constant BIG_NUMBER => 1e50;
+# Corpus-wide accumulators.  They start at zero (not undef) so the TOTAL
+# report prints numbers even when the reference file contains no lines.
+my ($total_words, $total_match, $total_cost) = (0, 0, 0);
+my ($total_ins, $total_del, $total_subst) = (0, 0, 0);
+while (defined(my $ref_utt = <REF>)) { # One reference utterance per iteration: pair it with a hypothesis, align, report, accumulate totals
+    my $hyp_utt;
+    my $ref_uttid;
+    my $hyp_uttid;
+    ($ref_utt,$ref_uttid)=s3_magic_norm($ref_utt);
+    if(defined $IgnoreUttID){
+	$hyp_utt = <HYP>; # --ignore-uttid: pair lines purely by position in the two files
+	die "UttID is ignored but file size mismatch between $ref and $hyp" unless defined($hyp_utt); # only catches a hypothesis file shorter than the reference
+    }else{
+	$hyp_utt=$hyphash{$ref_uttid}; # default: look the hypothesis up by the reference's utterance ID
+        die "UttID is not ignored but it could not found in any entries of the hypothesis file on line3 $. UTTID: $ref_uttid\n" unless defined($hyp_utt);
+    }
+    ($hyp_utt,$hyp_uttid)=s3_magic_norm($hyp_utt); # %hyphash values were stored as "text (uttid)", so this splits them apart again
+    if(! defined $IgnoreUttID){
+	die "Utterance ID mismatch on line $.: $ref_uttid != $hyp_uttid"
+	    unless $ref_uttid eq $hyp_uttid; # in this branch HYP is not read inside the loop, so $. is REF's line number
+    }
+    # Split the text into an array of words (split ' ' also discards leading whitespace)
+    my @ref_words = split ' ', $ref_utt;
+    my @hyp_words = split ' ', $hyp_utt;
+    my (@align_matrix, @backtrace_matrix);
+    # Initialize row 0 and column 0 of the alignment and backtrace matrices
+    initialize(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
+    # Do DP alignment maintaining backtrace pointers; $cost is the total edit cost
+    my $cost = align(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
+    # Walk the backtrace pointers to get the aligned word pairs and per-type counts
+    my ($alignment, $ins, $del, $subst, $match) = backtrace(\@ref_words, \@hyp_words,
+							    \@align_matrix, \@backtrace_matrix);
+    # Format the alignment as two space-padded rows whose columns line up
+    my ($ref_align, $hyp_align) = ("", "");
+    foreach (@$alignment) {
+	my ($ref, $hyp) = @$_; # these shadow the file-name variables for the rest of this inner block
+	my $width = 0;
+	# s3_magic_norm upper-cased every word, so errors are already capitalized; only matches are lowercased
+	if (defined($ref) and defined($hyp) and $ref eq $hyp) {
+	    $ref = lc $ref;
+	    $hyp = lc $hyp;
+	}
+	# Replace the missing side of an insertion/deletion (undef) with ***
+	foreach ($ref, $hyp) { $_ = "***" unless defined $_ };
+	# Find the width of this column (bare length operates on $_)
+	foreach ($ref, $hyp) { $width = length if length > $width };
+	$width = 3 if $width < 3; # Make it long enough for ***
+	# Space out the words and concatenate them to the output
+	$ref_align .= sprintf("%-*s ", $width, $ref);
+	$hyp_align .= sprintf("%-*s ", $width, $hyp);
+    }
+    print "$ref_align ($ref_uttid)\n$hyp_align ($hyp_uttid)\n";
+    # Print out the word error and accuracy rates (both relative to the reference length)
+    my $error = @ref_words == 0 ? 1 : $cost/@ref_words; # an empty reference is reported as 100% error
+    my $acc = @ref_words == 0 ? 0 : $match/@ref_words; # and as 0% correct
+    printf("Words: %d Correct: %d Errors: %d Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
+	   scalar(@ref_words), $match, $cost, $acc*100, $error*100, 100-$error*100);
+    print "Insertions: $ins Deletions: $del Substitutions: $subst\n";
+    $total_cost += $cost; # accumulate corpus-wide totals for the TOTAL report
+    $total_match += $match;
+    $total_words += @ref_words;
+    $total_ins += $ins;
+    $total_del += $del;
+    $total_subst += $subst;
+}
+# Print out the total word error and accuracy rates.
+# An empty reference file leaves the accumulators untouched, so treat any
+# still-undefined total as zero before using it.
+foreach ($total_words, $total_match, $total_cost,
+         $total_ins, $total_del, $total_subst) {
+    $_ = 0 unless defined $_;
+}
+# Guard the division: with no reference words at all, follow the same
+# convention as the per-utterance report (100% error, 0% correct) instead
+# of dying with "Illegal division by zero".
+my $error = $total_words == 0 ? 1 : $total_cost/$total_words;
+my $acc = $total_words == 0 ? 0 : $total_match/$total_words;
+printf("TOTAL Words: %d Correct: %d Errors: %d\nTOTAL Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
+       $total_words, $total_match, $total_cost, $acc*100, $error*100, 100-$error*100);
+print "TOTAL Insertions: $total_ins Deletions: $total_del Substitutions: $total_subst\n";
+# Normalize one line of a reference or hypothesis file.
+#
+# Argument: the raw line (a trailing newline is allowed).
+# Returns a two-element list ($text, $uttid):
+#   $text  - the upper-cased line with <...>, ++...++ and +...+ spans,
+#            "(1)".."(9)" markers and ":tag" suffixes removed, the trailing
+#            "(uttid ...)" stripped, and every '_' and '.' turned into a space.
+#   $uttid - the first space-free token inside the trailing parentheses
+#            (upper-cased, because case is normalized first), or undef when
+#            the line does not end in a parenthesized ID.
+sub s3_magic_norm{
+    my ($word)=@_;
+    # Remove line endings
+    $word =~ s/\s+$//;
+    # Normalize case
+    $word = uc $word;
+    # Remove filler words and context cues
+    $word =~ s/<[^>]+>//g;
+    $word =~ s/\+\+[^+]+\+\+//g;
+    $word =~ s/\+[^+]+\+//g;
+    # Remove alternative pronunciations
+    $word =~ s/\([1-9]\)//g;
+    # Remove class tags
+    $word =~ s/:\S+//g;
+    # Extract the utterance ID and remove it from the line.  Only read $1
+    # when this substitution matched: otherwise $1 still holds whatever an
+    # earlier successful match (possibly in the caller) captured, and that
+    # stale value would be returned as the ID.
+    my $uttid;
+    if ($word =~ s/\(([^) ]+)[^)]*\)$//) {
+        $uttid = $1;
+    }
+    # Split apart compound words and acronyms
+    $word =~ tr/_./  /;
+    return ($word,$uttid);
+}
+# Seed the borders of both matrices in place.
+#
+# Arguments are array references: the reference words, the hypothesis words,
+# the alignment (cost) matrix and the backtrace matrix.  Row 0 gets cost $col
+# with an INS pointer for every column 0..number of hypothesis words; column 0
+# gets cost $row with a DEL pointer for every row 0..number of reference
+# words.  The column pass runs last, so cell [0][0] ends up holding DEL.
+sub initialize {
+    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;
+    # Row 0: reaching column $col from the origin takes $col insertions
+    foreach my $col (0 .. scalar @$hyp_words) {
+        $align_matrix->[0][$col]     = $col;
+        $backtrace_matrix->[0][$col] = INS;
+    }
+    # Column 0: reaching row $row from the origin takes $row deletions
+    foreach my $row (0 .. scalar @$ref_words) {
+        $align_matrix->[$row][0]     = $row;
+        $backtrace_matrix->[$row][0] = DEL;
+    }
+}
+# Fill the interior of the alignment and backtrace matrices by dynamic
+# programming and return the total alignment cost.
+#
+# Arguments are array references: reference words, hypothesis words, the
+# alignment (cost) matrix and the backtrace matrix; both matrices must already
+# have row 0 and column 0 seeded.  Each cell takes the cheapest of insertion
+# (left + 1), deletion (above + 1) and match/substitution (diagonal + 0 or 1).
+# On ties the diagonal move wins, then insertion, then deletion.
+# Returns the cost stored in the bottom-right cell.
+sub align {
+    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;
+    my $ref_count = scalar @$ref_words;
+    my $hyp_count = scalar @$hyp_words;
+    foreach my $row (1 .. $ref_count) {
+        my $ref_word = $ref_words->[$row-1];
+        foreach my $col (1 .. $hyp_count) {
+            my $hyp_word = $hyp_words->[$col-1];
+            # Substitution cost: 1 when the words differ, 0 when they match
+            my $cost = ($ref_word ne $hyp_word) ? 1 : 0;
+            # Candidate costs for arriving at this cell
+            my $ins   = $align_matrix->[$row][$col-1] + 1;
+            my $del   = $align_matrix->[$row-1][$col] + 1;
+            my $subst = $align_matrix->[$row-1][$col-1] + $cost;
+            print "Costs at $row $col: INS $ins DEL $del SUBST $subst\n" if $Verbose;
+            # Cheapest of the three candidates
+            my $min = $subst;
+            $min = $ins if $ins < $min;
+            $min = $del if $del < $min;
+            $align_matrix->[$row][$col] = $min;
+            # Record which move produced the minimum; the diagonal is tested
+            # first so that ties keep the path diagonal.
+            if ($subst == $min) {
+                print(($cost ? "SUBSTITUTION" : "MATCH"),
+                      "($ref_word <=> $hyp_word)\n") if $Verbose;
+                # MATCH + 1 is SUBST
+                $backtrace_matrix->[$row][$col] = MATCH + $cost;
+            }
+            elsif ($ins == $min) {
+                print "INSERTION (0 => $hyp_word)\n" if $Verbose;
+                $backtrace_matrix->[$row][$col] = INS;
+            }
+            else {
+                # Only deletion is left: $min is always one of the three
+                print "DELETION ($ref_word => 0)\n" if $Verbose;
+                $backtrace_matrix->[$row][$col] = DEL;
+            }
+        }
+    }
+    return $align_matrix->[$ref_count][$hyp_count];
+}
+# Follow the backtrace pointers from the bottom-right cell back to the origin
+# and rebuild the alignment.
+#
+# Arguments are array references: reference words, hypothesis words, the
+# alignment (cost) matrix (only read for verbose tracing) and the backtrace
+# matrix.
+# Returns (\@alignment, $insertions, $deletions, $substitutions, $matches).
+# Each alignment entry is a [ref_word, hyp_word] pair in left-to-right order;
+# the ref side is undef for an insertion and the hyp side is undef for a
+# deletion.  An unrecognized pointer value stops the walk early.
+sub backtrace {
+    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;
+    my @pairs;
+    my ($row, $col) = (scalar @$ref_words, scalar @$hyp_words);
+    my %count = (ins => 0, del => 0, subst => 0, match => 0);
+    until ($row == 0 and $col == 0) {
+        my $op = $backtrace_matrix->[$row][$col];
+        print "Cost at $row $col: $align_matrix->[$row][$col]\n"
+            if $Verbose;
+        my $ref_word = $ref_words->[$row-1];
+        my $hyp_word = $hyp_words->[$col-1];
+        if ($op == INS) {
+            print "INSERTION (0 => $hyp_word)" if $Verbose;
+            # Hypothesis word with no reference counterpart; step left
+            unshift @pairs, [undef, $hyp_word];
+            $count{ins}++;
+            $col--;
+        }
+        elsif ($op == DEL) {
+            print "DELETION ($ref_word => 0)" if $Verbose;
+            # Reference word with no hypothesis counterpart; step up
+            unshift @pairs, [$ref_word, undef];
+            $count{del}++;
+            $row--;
+        }
+        elsif ($op == MATCH) {
+            print "MATCH ($ref_word <=> $hyp_word)" if $Verbose;
+            # Identical words; step diagonally
+            unshift @pairs, [$ref_word, $hyp_word];
+            $count{match}++;
+            $col--;
+            $row--;
+        }
+        elsif ($op == SUBST) {
+            print "SUBSTITUTION ($ref_word <=> $hyp_word)" if $Verbose;
+            # Differing words paired with each other; step diagonally
+            unshift @pairs, [$ref_word, $hyp_word];
+            $count{subst}++;
+            $col--;
+            $row--;
+        }
+        else {
+            # Unknown pointer value: give up rather than loop forever
+            last;
+        }
+        print " - moving to $row $col\n" if $Verbose;
+    }
+    return (\@pairs, $count{ins}, $count{del}, $count{subst}, $count{match});
+}
+__END__
+=head1 NAME
+calculate_wer - Calculate Word Error Rate from a reference and hypothesis file
+=head1 SYNOPSIS
+ calculate_wer [options] reference_file hypothesis_file
+=head1 OPTIONS
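+=over 8
+=item B<--help>, B<-?>
+Print a brief help message and exit.
+=item B<--verbose>, B<-v>
+Print out messages tracing the alignment algorithm.
+=item B<--ignore-uttid>, B<-i>
+Ignore utterance IDs and pair reference and hypothesis lines by their position in the two files.
+=back
+=cut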

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require 'word_aligner'
+RSpec.configure do |config| # Suite-wide RSpec settings for this gem's specs
+  config.treat_symbols_as_metadata_keys_with_true_values = true # presumably lets a bare :focus tag stand for focus: true (RSpec 2 option; verify against the installed RSpec version)
+  config.run_all_when_everything_filtered = true # presumably falls back to running every spec when the filter below matches none; confirm in the RSpec docs
+  config.filter_run :focus # restrict the run to examples tagged :focus
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+end

metadata ADDED Viewed

@@ -0,0 +1,143 @@
+--- !ruby/object:Gem::Specification
+name: word_aligner
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Maciej
+- Simon
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-09-10 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: unicode_utils
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: shoulda
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '3.12'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '3.12'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: jeweler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.8.7
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.8.7
+description: WordAligner allows you to compare two strings as CMU-Sphinx word-align.pl
+  does
+email: maciej@szukio.pl
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE.txt
+- README.md
+files:
+- Gemfile
+- Gemfile.lock
+- Guardfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- VERSION
+- lib/word_aligner.rb
+- lib/word_aligner/aligner.rb
+- lib/word_aligner/word_error_rate.rb
+- spec/lib/word_aligner/aligner_spec.rb
+- spec/lib/word_aligner/word_error_rate_spec.rb
+- spec/lib/word_aligner_spec.rb
+- spec/sample_data/grab_for_comparision.rb
+- spec/sample_data/regression/sentences.yml
+- spec/sample_data/source_data/enough.hypotheses.txt
+- spec/sample_data/source_data/enough.txt
+- spec/sample_data/source_data/exactly_data.hypotheses.txt
+- spec/sample_data/source_data/exactly_data.txt
+- spec/sample_data/source_data/hamlet.hypotheses.txt
+- spec/sample_data/source_data/hamlet.txt
+- spec/sample_data/source_data/that_might.hypotheses.txt
+- spec/sample_data/source_data/that_might.txt
+- spec/sample_data/word_align.pl
+- spec/spec_helper.rb
+homepage: http://github.com/lunatyq/word_aligner
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.1.1
+signing_key:
+specification_version: 4
+summary: CMU-Sphinx-like word-align.pl comparison
+test_files: []
+has_rdoc: