RubyGems - levenshtein - Versions diffs - 0.2.0 → 0.2.1 - Mend

levenshtein 0.2.0 → 0.2.1

Files changed (13)

data/CHANGELOG +8 -0
data/VERSION +1 -1
data/ext/levenshtein/levenshtein.h +13 -0
data/ext/levenshtein/levenshtein_array.c +24 -21
data/ext/levenshtein/levenshtein_array_of_strings.c +24 -19
data/ext/levenshtein/levenshtein_fast.c +3 -2
data/ext/levenshtein/levenshtein_generic.c +24 -20
data/ext/levenshtein/levenshtein_string.c +26 -21
data/lib/levenshtein.rb +61 -53
data/lib/levenshtein/exception.rb +4 -0
data/lib/levenshtein/version.rb +3 -0
data/test/test.rb +21 -9
metadata +25 -11

data/CHANGELOG CHANGED

@@ -1,3 +1,11 @@
+0.2.1 (11-02-2012)
+* Better memory handling.
+* Little speed improvements.
+* Ruby 1.9 compatible?
 0.2.0 (11-07-2009)
 * Return 0 instead of 0.0 in case of empty strings.

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.2.1

data/ext/levenshtein/levenshtein.h ADDED

@@ -0,0 +1,13 @@
+#ifdef RARRAY_PTR
+#else
+#define RARRAY_PTR(o) (RARRAY(o)->ptr)
+#define RARRAY_LEN(o) (RARRAY(o)->len)
+#endif
+#ifdef RSTRING_PTR
+#else
+#define RSTRING_PTR(o) (RSTRING(o)->ptr)
+#define RSTRING_LEN(o) (RSTRING(o)->len)
+#endif
+VALUE mLevenshtein;

data/ext/levenshtein/levenshtein_array.c CHANGED

@@ -1,19 +1,19 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
-  ID id_eql	= rb_intern("==");
+  int	value1, value2;
   /* Get the sizes of both arrays. */
-  l1	= RARRAY(rb_o1)->len;
-  l2	= RARRAY(rb_o2)->len;
+  l1	= RARRAY_LEN(rb_o1);
+  l2	= RARRAY_LEN(rb_o2);
   /* Convert Ruby's threshold to C's threshold. */
@@ -26,13 +26,14 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
     l1--;
     l2--;
   }
@@ -57,12 +58,8 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -73,7 +70,9 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -83,25 +82,29 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
+      value1	= prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/ext/levenshtein/levenshtein_array_of_strings.c CHANGED

@@ -1,17 +1,19 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
+  int	value1, value2;
   /* Get the sizes of both arrays. */
-  l1	= RARRAY(rb_o1)->len;
-  l2	= RARRAY(rb_o2)->len;
+  l1	= RARRAY_LEN(rb_o1);
+  l2	= RARRAY_LEN(rb_o2);
   /* Convert Ruby's threshold to C's threshold. */
@@ -24,13 +26,14 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
+  while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
+  while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
     l1--;
     l2--;
   }
@@ -55,12 +58,8 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -71,7 +70,9 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -81,25 +82,29 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
+      value1	= prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/ext/levenshtein/levenshtein_fast.c CHANGED

@@ -1,4 +1,5 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
@@ -15,7 +16,7 @@ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_t
 }
 void Init_levenshtein_fast() {
-  VALUE mLevenshtein	= rb_define_module("Levenshtein");
+  mLevenshtein	= rb_const_get(rb_mKernel, rb_intern("Levenshtein"));
-  rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
+  rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3);
 }

data/ext/levenshtein/levenshtein_generic.c CHANGED

@@ -1,16 +1,17 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
+  int	value1, value2;
-  ID id_length	= rb_intern("length");
-  ID id_get	= rb_intern("[]");
-  ID id_equal	= rb_intern("==");
+  ID	id_length	= rb_intern("length");
+  ID	id_get		= rb_intern("[]");
   /* Get the sizes of both sequences. */
@@ -28,13 +29,14 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
     l1--;
     l2--;
   }
@@ -59,12 +61,8 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -75,7 +73,9 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -85,25 +85,29 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
+      value1	= prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/ext/levenshtein/levenshtein_string.c CHANGED

@@ -1,25 +1,27 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
+  int	value1, value2;
   char	*s1, *s2;
   /* Convert Ruby's s1 to C's s1. */
   rb_o1	= StringValue(rb_o1);
-  s1	= RSTRING(rb_o1)->ptr;
-  l1	= RSTRING(rb_o1)->len;
+  s1	= RSTRING_PTR(rb_o1);
+  l1	= RSTRING_LEN(rb_o1);
   /* Convert Ruby's s2 to C's s2. */
   rb_o2	= StringValue(rb_o2);
-  s2	= RSTRING(rb_o2)->ptr;
-  l2	= RSTRING(rb_o2)->len;
+  s2	= RSTRING_PTR(rb_o2);
+  l2	= RSTRING_LEN(rb_o2);
   /* Convert Ruby's threshold to C's threshold. */
@@ -32,13 +34,14 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while (s1[offset] == s2[offset]) {
+  while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
+  while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
     l1--;
     l2--;
   }
@@ -63,12 +66,8 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -79,7 +78,9 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -89,25 +90,29 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
+      value1	= prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/lib/levenshtein.rb CHANGED

@@ -1,44 +1,25 @@
-begin
-  require "levenshtein/levenshtein_fast"	# If compiled by RubyGems.
-rescue LoadError
-  begin
-    require "levenshtein_fast"			# If compiled by the build script.
-  rescue LoadError
-    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
-  end
-end
-# The Levenshtein distance is a metric for measuring the amount
-# of difference between two sequences (i.e., the so called edit
-# distance). The Levenshtein distance between two sequences is
-# given by the minimum number of operations needed to transform
-# one sequence into the other, where an operation is an
-# insertion, deletion, or substitution of a single element.
-#
-# More information about the Levenshtein distance algorithm:
-# http://en.wikipedia.org/wiki/Levenshtein_distance .
+require "levenshtein/exception"
+require "levenshtein/version"
 module Levenshtein
-  VERSION	= "0.2.0"
   # Returns the Levenshtein distance as a number between 0.0 and
   # 1.0. It's basically the Levenshtein distance divided by the
   # length of the longest sequence.
-  def self.normalized_distance(s1, s2, threshold=nil)
-    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
+  def self.normalized_distance(a1, a2, threshold=nil)
+    a1, a2	= a2, a1	if a1.length > a2.length	# a1 is the short one; a2 is the long one.
-    if s2.length == 0
-      0.0	# Since s1.length < s2.length, s1 must be empty as well.
+    if a2.length == 0
+      0.0	# Since a1.length < a2.length, a1 must be empty as well.
     else
       if threshold
-        if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
-          d.to_f/s2.length
+        if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
+          d.to_f/a2.length
         else
           nil
         end
       else
-        self.distance(s1, s2).to_f/s2.length
+        self.distance(a1, a2).to_f/a2.length
       end
     end
   end
@@ -53,47 +34,64 @@ module Levenshtein
   # The sequences should respond to :length and :[] and all objects
   # in the sequences (as returned by []) should respond to :==.
-  def self.distance(s1, s2, threshold=nil)
-    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
+  def self.distance(a1, a2, threshold=nil)
+    a1, a2	= a2, a1	if a1.length > a2.length	# a1 is the short one; a2 is the long one.
     # Handle some basic circumstances.
-    return 0		if s1 == s2
-    return s2.length	if s1.length == 0
+    return 0		if a1 == a2
+    return a2.length	if a1.length == 0
     if threshold
-      return nil	if (s2.length-s1.length) >= threshold
+      return nil	if (a2.length-a1.length) >= threshold
-      a1, a2	= nil, nil
-      a1, a2	= s1, s2			if s1.respond_to?(:-) and s2.respond_to?(:-)
-      a1, a2	= s1.scan(/./), s2.scan(/./)	if s1.respond_to?(:scan) and s2.respond_to?(:scan)
+      a3, a4	= nil, nil
+      a3, a4	= a1, a2			if a1.respond_to?(:-) and a2.respond_to?(:-)
+      a3, a4	= a1.scan(/./), a2.scan(/./)	if a1.respond_to?(:scan) and a2.respond_to?(:scan)
-      if a1 and a2
-        return nil	if (a1-a2).length >= threshold
-        return nil	if (a2-a1).length >= threshold
+      if a3 and a4
+        return nil	if (a3-a4).length >= threshold
+        return nil	if (a4-a3).length >= threshold
       end
     end
-    distance_fast_or_slow(s1, s2, threshold)
+    distance_fast_or_slow(a1, a2, threshold)
   end
-  def self.distance_fast_or_slow(s1, s2, threshold)	# :nodoc:
-    if respond_to?(:levenshtein_distance_fast)
-      levenshtein_distance_fast(s1, s2, threshold)	# Implemented in C.
+  def self.distance_fast_or_slow(a1, a2, threshold)	# :nodoc:
+    if respond_to?(:distance_fast)
+      distance_fast(a1, a2, threshold)	# Implemented in C.
     else
-      levenshtein_distance_slow(s1, s2, threshold)	# Implemented in Ruby.
+      distance_slow(a1, a2, threshold)	# Implemented in Ruby.
     end
   end
-  def self.levenshtein_distance_slow(s1, s2, threshold)	# :nodoc:
-    row	= (0..s1.length).to_a
+  def self.distance_slow(a1, a2, threshold)	# :nodoc:
+    l1	= a1.length
+    l2	= a2.length
-    1.upto(s2.length) do |y|
-      prow	= row
-      row	= [y]
+    offset	= 0
+    while offset < l1 and offset < l2 and a1[offset] == a2[offset]
+      offset += 1
+    end
+    while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
+      l1 -= 1
+      l2 -= 1
+    end
+    l1 -= offset
+    l2 -= offset
-      1.upto(s1.length) do |x|
-        row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
+    crow	= (0..l1).to_a
+    1.upto(l2) do |y|
+      prow	= crow
+      crow	= [y]
+      1.upto(l1) do |x|
+        crow[x]	= [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
       end
       # Stop analysing this sequence as soon as the best possible
@@ -101,9 +99,19 @@ module Levenshtein
       # (The minimum value in the next row will be equal to or greater
       # than the minimum value in this row.)
-      return nil	if threshold and row.min >= threshold
+      return nil	if threshold and crow.min >= threshold
     end
-    row[-1]
+    crow[-1]
+  end
+end
+begin
+  require "levenshtein/levenshtein_fast"	# Compiled by RubyGems.
+rescue LoadError
+  begin
+    require "levenshtein_fast"			# Compiled by the build script.
+  rescue LoadError
+    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
   end
 end

data/lib/levenshtein/exception.rb ADDED

@@ -0,0 +1,4 @@
+module Levenshtein
+  class LevenshteinException < RuntimeError
+  end
+end

data/lib/levenshtein/version.rb ADDED

@@ -0,0 +1,3 @@
+module Levenshtein
+  VERSION	= "0.2.1"
+end

data/test/test.rb CHANGED

@@ -12,6 +12,8 @@ module Levenshtein
     end
     def [](pos)
+      raise "type not allowed [#{pos.inspect}]"	unless pos.kind_of?(Fixnum)
       @sequence[pos]
     end
   end
@@ -105,21 +107,31 @@ end
 class TestLevenshteinSlow < Test::Unit::TestCase
   def test_erik_veenstra
-    assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
+    assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
+    assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
   end
-  def test_empty_sequence
-    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
-    assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
+  def test_empty_string
+    assert_equal(0, Levenshtein.distance_slow("", "", nil))
+    assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
+    assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
   end
-  def test_same_sequence
-    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
-    assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
+  def test_same_string
+    assert_equal(0, Levenshtein.distance_slow("", "", nil))
+    assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
   end
   def test_threshold
-    assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
-    assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
+    assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
+    assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
+    assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
+  end
+  def test_same_head_and_or_tail
+    assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
+    assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
+    assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
+    assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
   end
 end

metadata CHANGED

@@ -1,7 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: levenshtein
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  hash: 21
+  prerelease:
+  segments:
+  - 0
+  - 2
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Erik Veenstra
@@ -9,8 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-07-11 00:00:00 +02:00
-default_executable:
+date: 2012-02-11 00:00:00 Z
 dependencies: []
 description: Calculates the Levenshtein distance between two byte strings.
@@ -22,18 +27,21 @@ extensions:
 extra_rdoc_files: []
 files:
+- lib/levenshtein/exception.rb
+- lib/levenshtein/version.rb
 - lib/levenshtein.rb
-- ext/levenshtein/extconf.rb
-- ext/levenshtein/levenshtein_array_of_strings.c
-- ext/levenshtein/levenshtein_fast.c
 - ext/levenshtein/levenshtein_string.c
 - ext/levenshtein/levenshtein_generic.c
+- ext/levenshtein/levenshtein.h
+- ext/levenshtein/levenshtein_fast.c
+- ext/levenshtein/levenshtein_array_of_strings.c
 - ext/levenshtein/levenshtein_array.c
+- ext/levenshtein/extconf.rb
 - README
 - LICENSE
 - VERSION
 - CHANGELOG
-has_rdoc: true
+- test/test.rb
 homepage: http://www.erikveen.dds.nl/levenshtein/index.html
 licenses: []
@@ -44,27 +52,33 @@ rdoc_options:
 - VERSION
 - CHANGELOG
 - --title
-- levenshtein (0.2.0)
+- levenshtein (0.2.1)
 - --main
 - README
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
 rubyforge_project: levenshtein
-rubygems_version: 1.3.4
+rubygems_version: 1.8.12
 signing_key:
 specification_version: 3
 summary: Calculates the Levenshtein distance between two byte strings.