RubyGems - levenshtein - Versions diffs - 0.1.1 → 0.2.0 - Mend

levenshtein 0.1.1 → 0.2.0

Files changed (12) hide show

data/CHANGELOG +12 -0
data/README +7 -3
data/VERSION +1 -1
data/ext/levenshtein/extconf.rb +6 -1
data/ext/levenshtein/levenshtein_array.c +127 -0
data/ext/levenshtein/levenshtein_array_of_strings.c +125 -0
data/ext/levenshtein/levenshtein_fast.c +21 -0
data/ext/levenshtein/levenshtein_generic.c +129 -0
data/ext/levenshtein/{levenshtein_c.c → levenshtein_string.c} +31 -20
data/lib/levenshtein.rb +45 -36
data/test/test.rb +57 -23
metadata +12 -7

data/CHANGELOG CHANGED

@@ -1,3 +1,15 @@
+0.2.0 (11-07-2009)
+* Return 0 instead of 0.0 in case of empty strings.
+* Added specific support for arrays.
+* Added specific support for arrays of strings.
+* Added generic support for all (?) kinds of sequences.
+* Moved a lot of code to the C world.
 0.1.1 (06-10-2008)
 * If one of the strings was both the begin and the end of the

data/README CHANGED

@@ -1,8 +1,12 @@
 The Levenshtein distance is a metric for measuring the amount of difference
 between two sequences (i.e., the so-called edit distance). The Levenshtein
-distance between two strings is given by the minimum number of operations
-needed to transform one string into the other, where an operation is an
-insertion, deletion, or substitution of a single character.
+distance between two sequences is given by the minimum number of operations
+needed to transform one sequence into the other, where an operation is an
+insertion, deletion, or substitution of a single element.
+The two sequences can be two strings, two arrays, or two other objects.
+Strings, arrays and arrays of strings are handled with optimized (very fast) C
+code. All other sequences are handled with generic (fast) C code.
 More information about the Levenshtein distance algorithm:
 http://en.wikipedia.org/wiki/Levenshtein_distance .

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.1.1
1	+ 0.2.0

data/ext/levenshtein/extconf.rb CHANGED

@@ -2,4 +2,9 @@ require "mkmf"
 dir_config("levenshtein")
-create_makefile("levenshtein/levenshtein_c")
+have_library("levenshtein_array")
+have_library("levenshtein_array_of_strings")
+have_library("levenshtein_generic")
+have_library("levenshtein_string")
+create_makefile("levenshtein/levenshtein_fast")

data/ext/levenshtein/levenshtein_array.c ADDED

@@ -0,0 +1,127 @@
+#include "ruby.h"
+VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int	threshold;
+  int	l1, l2;
+  int	*prev_row, *curr_row;
+  int	col, row;
+  int	curr_row_min, result;
+  int	offset;
+  ID id_eql	= rb_intern("==");
+  /* Get the sizes of both arrays. */
+  l1	= RARRAY(rb_o1)->len;
+  l2	= RARRAY(rb_o2)->len;
+  /* Convert Ruby's threshold to C's threshold. */
+  if (!NIL_P(rb_threshold)) {
+    threshold	= FIX2INT(rb_threshold);
+  } else {
+    threshold	= -1;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
+  offset	= 0;
+  while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
+    offset++;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate memory for both rows */
+  prev_row	= ALLOC_N(int, l1+1);
+  curr_row	= ALLOC_N(int, l1+1);
+  if ((prev_row == NULL) || (curr_row == NULL)) {
+    rb_raise(rb_eNoMemError, "out of memory");
+  }
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col]	= col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* Copy the current row to the previous row. */
+    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    /* Calculate the values of the current row. */
+    curr_row[0]		= row;
+    curr_row_min	= row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col]	= prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min	= curr_row[col];
+      }
+    }
+    /* Return nil as soon as we exceed the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      free(prev_row);
+      free(curr_row);
+      return Qnil;
+    }
+  }
+  /* The result is the last value on the last row. */
+  result	= curr_row[l1];
+  free(prev_row);
+  free(curr_row);
+  /* Return the Ruby version of the result. */
+  return INT2FIX(result);
+}

data/ext/levenshtein/levenshtein_array_of_strings.c ADDED

@@ -0,0 +1,125 @@
+#include "ruby.h"
+VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int	threshold;
+  int	l1, l2;
+  int	*prev_row, *curr_row;
+  int	col, row;
+  int	curr_row_min, result;
+  int	offset;
+  /* Get the sizes of both arrays. */
+  l1	= RARRAY(rb_o1)->len;
+  l2	= RARRAY(rb_o2)->len;
+  /* Convert Ruby's threshold to C's threshold. */
+  if (!NIL_P(rb_threshold)) {
+    threshold	= FIX2INT(rb_threshold);
+  } else {
+    threshold	= -1;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
+  offset	= 0;
+  while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
+    offset++;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate memory for both rows */
+  prev_row	= ALLOC_N(int, l1+1);
+  curr_row	= ALLOC_N(int, l1+1);
+  if ((prev_row == NULL) || (curr_row == NULL)) {
+    rb_raise(rb_eNoMemError, "out of memory");
+  }
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col]	= col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* Copy the current row to the previous row. */
+    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    /* Calculate the values of the current row. */
+    curr_row[0]		= row;
+    curr_row_min	= row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col]	= prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min	= curr_row[col];
+      }
+    }
+    /* Return nil as soon as we exceed the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      free(prev_row);
+      free(curr_row);
+      return Qnil;
+    }
+  }
+  /* The result is the last value on the last row. */
+  result	= curr_row[l1];
+  free(prev_row);
+  free(curr_row);
+  /* Return the Ruby version of the result. */
+  return INT2FIX(result);
+}

data/ext/levenshtein/levenshtein_fast.c ADDED

@@ -0,0 +1,21 @@
+#include "ruby.h"
+VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
+    return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
+  } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
+    if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
+      return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
+    } else {
+      return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
+    }
+  } else {
+    return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
+  }
+}
+void Init_levenshtein_fast() {
+  VALUE mLevenshtein	= rb_define_module("Levenshtein");
+  rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
+}

data/ext/levenshtein/levenshtein_generic.c ADDED

@@ -0,0 +1,129 @@
+#include "ruby.h"
+VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int	threshold;
+  int	l1, l2;
+  int	*prev_row, *curr_row;
+  int	col, row;
+  int	curr_row_min, result;
+  int	offset;
+  ID id_length	= rb_intern("length");
+  ID id_get	= rb_intern("[]");
+  ID id_equal	= rb_intern("==");
+  /* Get the sizes of both sequences. */
+  l1	= FIX2INT(rb_funcall(rb_o1, id_length, 0));
+  l2	= FIX2INT(rb_funcall(rb_o2, id_length, 0));
+  /* Convert Ruby's threshold to C's threshold. */
+  if (!NIL_P(rb_threshold)) {
+    threshold	= FIX2INT(rb_threshold);
+  } else {
+    threshold	= -1;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
+  offset	= 0;
+  while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
+    offset++;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate memory for both rows */
+  prev_row	= ALLOC_N(int, l1+1);
+  curr_row	= ALLOC_N(int, l1+1);
+  if ((prev_row == NULL) || (curr_row == NULL)) {
+    rb_raise(rb_eNoMemError, "out of memory");
+  }
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col]	= col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* Copy the current row to the previous row. */
+    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    /* Calculate the values of the current row. */
+    curr_row[0]		= row;
+    curr_row_min	= row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col]	= prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min	= curr_row[col];
+      }
+    }
+    /* Return nil as soon as we exceed the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      free(prev_row);
+      free(curr_row);
+      return Qnil;
+    }
+  }
+  /* The result is the last value on the last row. */
+  result	= curr_row[l1];
+  free(prev_row);
+  free(curr_row);
+  /* Return the Ruby version of the result. */
+  return INT2FIX(result);
+}

data/ext/levenshtein/{levenshtein_c.c → levenshtein_string.c} RENAMED

@@ -1,25 +1,25 @@
 #include "ruby.h"
-static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
-  VALUE	rb_s3;
+VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
-  int	l1, l2, l3;
-  char	*s1, *s2, *s3;
+  int	l1, l2;
   int	*prev_row, *curr_row;
   int	col, row;
   int	curr_row_min, result;
+  int	offset;
+  char	*s1, *s2;
   /* Convert Ruby's s1 to C's s1. */
-  rb_s1	= StringValue(rb_s1);
-  s1	= RSTRING(rb_s1)->ptr;
-  l1	= RSTRING(rb_s1)->len;
+  rb_o1	= StringValue(rb_o1);
+  s1	= RSTRING(rb_o1)->ptr;
+  l1	= RSTRING(rb_o1)->len;
   /* Convert Ruby's s2 to C's s2. */
-  rb_s2	= StringValue(rb_s2);
-  s2	= RSTRING(rb_s2)->ptr;
-  l2	= RSTRING(rb_s2)->len;
+  rb_o2	= StringValue(rb_o2);
+  s2	= RSTRING(rb_o2)->ptr;
+  l2	= RSTRING(rb_o2)->len;
   /* Convert Ruby's threshold to C's threshold. */
@@ -29,7 +29,24 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
     threshold	= -1;
   }
-  /* The Levenshtein Algorithm itself. */
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
+  offset	= 0;
+  while (s1[offset] == s2[offset]) {
+    offset++;
+  }
+  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
   /*       s1=              */
   /*       ERIK             */
@@ -43,7 +60,7 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
   /*    T 65555             */
   /*    R 76566             */
   /*    A 87667             */
   /* Allocate memory for both rows */
   prev_row	= ALLOC_N(int, l1+1);
@@ -70,9 +87,9 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
     curr_row_min	= row;
     for (col=1; col<=l1; col++) {
-      /* Equal (cost=0) or Substitution (cost=1). */
+      /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);
+      curr_row[col]	= prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
@@ -114,9 +131,3 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
   return INT2FIX(result);
 }
-void Init_levenshtein_c() {
-  VALUE mLevenshtein	= rb_define_module("Levenshtein");
-  rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
-}

data/lib/levenshtein.rb CHANGED

@@ -1,30 +1,34 @@
 begin
-  require "levenshtein/levenshtein_c"
+  require "levenshtein/levenshtein_fast"	# If compiled by RubyGems.
 rescue LoadError
   begin
-    require "levenshtein_c"
+    require "levenshtein_fast"			# If compiled by the build script.
   rescue LoadError
-    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
+    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
   end
 end
-# The Levenshtein distance is a metric for measuring the amount of difference
-# between two sequences (i.e., the so called edit distance). The Levenshtein
-# distance between two strings is given by the minimum number of operations
-# needed to transform one string into the other, where an operation is an
-# insertion, deletion, or substitution of a single character.
+# The Levenshtein distance is a metric for measuring the amount
+# of difference between two sequences (i.e., the so-called edit
+# distance). The Levenshtein distance between two sequences is
+# given by the minimum number of operations needed to transform
+# one sequence into the other, where an operation is an
+# insertion, deletion, or substitution of a single element.
 #
 # More information about the Levenshtein distance algorithm:
 # http://en.wikipedia.org/wiki/Levenshtein_distance .
 module Levenshtein
-  # Returns the Levenshtein distance as a number between 0.0 and 1.0.
-  # It's basically the Levenshtein distance divided by the length of the longest string.
+  VERSION	= "0.2.0"
+  # Returns the Levenshtein distance as a number between 0.0 and
+  # 1.0. It's basically the Levenshtein distance divided by the
+  # length of the longest sequence.
   def self.normalized_distance(s1, s2, threshold=nil)
     s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
-    if s2.empty?
+    if s2.length == 0
       0.0	# Since s1.length < s2.length, s1 must be empty as well.
     else
       if threshold
@@ -39,46 +43,49 @@ module Levenshtein
     end
   end
-  # Returns the Levenshtein distance between two byte strings.
+  # Returns the Levenshtein distance between two sequences.
+  #
+  # The two sequences can be two strings, two arrays, or two other
+  # objects. Strings, arrays and arrays of strings are handled with
+  # optimized (very fast) C code. All other sequences are handled
+  # with generic (fast) C code.
+  #
+  # The sequences should respond to :length and :[] and all objects
+  # in the sequences (as returned by []) should respond to :==.
   def self.distance(s1, s2, threshold=nil)
     s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
     # Handle some basic circumstances.
-    return 0.0		if s1 == s2
-    return s2.length	if s1.empty?
-    return nil		if threshold and (s2.length-s1.length) >= threshold
-    return nil		if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
-    return nil		if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
-    # Do the expensive calculation on a subset of the strings only, if possible.
+    return 0		if s1 == s2
+    return s2.length	if s1.length == 0
-    b	= 0
-    e1	= s1.length-1
-    e2	= s2.length-1
+    if threshold
+      return nil	if (s2.length-s1.length) >= threshold
-    while s1[b, 1] == s2[b, 1]
-      b += 1
-    end
+      a1, a2	= nil, nil
+      a1, a2	= s1, s2			if s1.respond_to?(:-) and s2.respond_to?(:-)
+      a1, a2	= s1.scan(/./), s2.scan(/./)	if s1.respond_to?(:scan) and s2.respond_to?(:scan)
-    while s1[e1, 1] == s2[e2, 1] and e1 > b and e2 > b
-      e1 -= 1
-      e2 -= 1
+      if a1 and a2
+        return nil	if (a1-a2).length >= threshold
+        return nil	if (a2-a1).length >= threshold
+      end
     end
-    distance_part2(s1[b..e1], s2[b..e2], threshold)
+    distance_fast_or_slow(s1, s2, threshold)
   end
-  def self.distance_part2(s1, s2, threshold)	# :nodoc:
-    if respond_to?(:distance_part2_fast)
-      distance_part2_fast(s1, s2, threshold)	# Implemented in C.
+  def self.distance_fast_or_slow(s1, s2, threshold)	# :nodoc:
+    if respond_to?(:levenshtein_distance_fast)
+      levenshtein_distance_fast(s1, s2, threshold)	# Implemented in C.
     else
-      distance_part2_slow(s1, s2, threshold)	# Implemented in Ruby.
+      levenshtein_distance_slow(s1, s2, threshold)	# Implemented in Ruby.
     end
   end
-  def self.distance_part2_slow(s1, s2, threshold)	# :nodoc:
+  def self.levenshtein_distance_slow(s1, s2, threshold)	# :nodoc:
     row	= (0..s1.length).to_a
     1.upto(s2.length) do |y|
@@ -89,8 +96,10 @@ module Levenshtein
         row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
       end
-      # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
-      # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)
+      # Stop analysing this sequence as soon as the best possible
+      # result for this sequence is bigger than the best result so far.
+      # (The minimum value in the next row will be equal to or greater
+      # than the minimum value in this row.)
       return nil	if threshold and row.min >= threshold
     end

data/test/test.rb CHANGED

@@ -1,7 +1,35 @@
 require "test/unit"
 require "levenshtein"
-class TestLevenshtein < Test::Unit::TestCase
+module Levenshtein
+  class TestSequence
+    def initialize(o)
+      @sequence	= o
+    end
+    def length
+      @sequence.length
+    end
+    def [](pos)
+      @sequence[pos]
+    end
+  end
+  class TestElement
+    attr_reader :object
+    def initialize(o)
+      @object	= o
+    end
+    def ==(other)
+      @object == other.object
+    end
+  end
+end
+class TestLevenshteinString < Test::Unit::TestCase
   def test_erik_veenstra
     assert_equal(7, Levenshtein.distance("erik", "veenstra"))
     assert_equal(7, Levenshtein.distance("veenstra", "erik"))
@@ -30,9 +58,11 @@ class TestLevenshtein < Test::Unit::TestCase
   def test_threshold
     assert_equal(3, Levenshtein.distance("foo", "foobar"))
+    assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
     assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
     assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
+    assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
     assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
   end
@@ -45,47 +75,51 @@ class TestLevenshtein < Test::Unit::TestCase
     assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
     assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
     assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
+    assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
   end
 end
-class TestLevenshteinPart2Slow < Test::Unit::TestCase
+class TestLevenshteinArray < Test::Unit::TestCase
   def test_erik_veenstra
-    assert_equal(7, Levenshtein.distance_part2_slow("erik", "veenstra", nil))
-  end
+    x	= lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
-  def test_empty_string
-    assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
-    assert_equal(3, Levenshtein.distance_part2_slow("", "foo", nil))
+    assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
   end
+end
-  def test_same_string
-    assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
-    assert_equal(0, Levenshtein.distance_part2_slow("foo", "foo", nil))
+class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
+  def test_erik_veenstra
+    x	= lambda{|s| s.scan(/./)}
+    assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
   end
+end
-  def test_threshold
-    assert_equal(3, Levenshtein.distance_part2_slow("foo", "foobar", nil))
-    assert_equal(nil, Levenshtein.distance_part2_slow("foo", "foobar", 2))
+class TestLevenshteinGeneric < Test::Unit::TestCase
+  def test_erik_veenstra
+    x	= lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
+    assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
   end
 end
-class TestLevenshteinPart2Fast < Test::Unit::TestCase
+class TestLevenshteinSlow < Test::Unit::TestCase
   def test_erik_veenstra
-    assert_equal(7, Levenshtein.distance_part2_fast("erik", "veenstra", nil))
+    assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
   end
-  def test_empty_string
-    assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
-    assert_equal(3, Levenshtein.distance_part2_fast("", "foo", nil))
+  def test_empty_sequence
+    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
+    assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
   end
-  def test_same_string
-    assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
-    assert_equal(0, Levenshtein.distance_part2_fast("foo", "foo", nil))
+  def test_same_sequence
+    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
+    assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
   end
   def test_threshold
-    assert_equal(3, Levenshtein.distance_part2_fast("foo", "foobar", nil))
-    assert_equal(nil, Levenshtein.distance_part2_fast("foo", "foobar", 2))
+    assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
+    assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
   end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: levenshtein
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - Erik Veenstra
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-10-06 00:00:00 +02:00
+date: 2009-07-11 00:00:00 +02:00
 default_executable:
 dependencies: []
@@ -23,15 +23,20 @@ extra_rdoc_files: []
 files:
 - lib/levenshtein.rb
-- ext/levenshtein
 - ext/levenshtein/extconf.rb
-- ext/levenshtein/levenshtein_c.c
+- ext/levenshtein/levenshtein_array_of_strings.c
+- ext/levenshtein/levenshtein_fast.c
+- ext/levenshtein/levenshtein_string.c
+- ext/levenshtein/levenshtein_generic.c
+- ext/levenshtein/levenshtein_array.c
 - README
 - LICENSE
 - VERSION
 - CHANGELOG
 has_rdoc: true
 homepage: http://www.erikveen.dds.nl/levenshtein/index.html
+licenses: []
 post_install_message:
 rdoc_options:
 - README
@@ -39,7 +44,7 @@ rdoc_options:
 - VERSION
 - CHANGELOG
 - --title
-- levenshtein (0.1.1)
+- levenshtein (0.2.0)
 - --main
 - README
 require_paths:
@@ -59,9 +64,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: levenshtein
-rubygems_version: 1.2.0
+rubygems_version: 1.3.4
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: Calculates the Levenshtein distance between two byte strings.
 test_files:
 - test/test.rb