RubyGems - levenshtein - Versions diffs - 0.2.0 → 0.2.1 - Mend

levenshtein 0.2.0 → 0.2.1

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (13)

data/CHANGELOG +8 -0
data/VERSION +1 -1
data/ext/levenshtein/levenshtein.h +13 -0
data/ext/levenshtein/levenshtein_array.c +24 -21
data/ext/levenshtein/levenshtein_array_of_strings.c +24 -19
data/ext/levenshtein/levenshtein_fast.c +3 -2
data/ext/levenshtein/levenshtein_generic.c +24 -20
data/ext/levenshtein/levenshtein_string.c +26 -21
data/lib/levenshtein.rb +61 -53
data/lib/levenshtein/exception.rb +4 -0
data/lib/levenshtein/version.rb +3 -0
data/test/test.rb +21 -9
metadata +25 -11

data/CHANGELOG CHANGED

@@ -1,3 +1,11 @@
+0.2.1 (11-02-2012)
+* Better memory handling.
+* Little speed improvements.
+* Ruby 1.9 compatible?
 0.2.0 (11-07-2009)
 * Return 0 instead of 0.0 in case of empty strings.

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.2.1

data/ext/levenshtein/levenshtein.h ADDED

@@ -0,0 +1,13 @@
+#ifdef RARRAY_PTR
+#else
+#define RARRAY_PTR(o) (RARRAY(o)->ptr)
+#define RARRAY_LEN(o) (RARRAY(o)->len)
+#endif
+#ifdef RSTRING_PTR
+#else
+#define RSTRING_PTR(o) (RSTRING(o)->ptr)
+#define RSTRING_LEN(o) (RSTRING(o)->len)
+#endif
+VALUE mLevenshtein;

data/ext/levenshtein/levenshtein_array.c CHANGED

@@ -1,19 +1,19 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
-  ID id_eql	= rb_intern("==");
+  int	value1, value2;
   /* Get the sizes of both arrays. */
-  l1	= RARRAY(rb_o1)->len;
-  l2	= RARRAY(rb_o2)->len;
+  l1	= RARRAY_LEN(rb_o1);
+  l2	= RARRAY_LEN(rb_o2);
   /* Convert Ruby's threshold to C's threshold. */
@@ -26,13 +26,14 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
     l1--;
     l2--;
   }
@@ -57,12 +58,8 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -73,7 +70,9 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -83,25 +82,29 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
+      value1	= prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/ext/levenshtein/levenshtein_array_of_strings.c CHANGED

@@ -1,17 +1,19 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
+  int	value1, value2;
   /* Get the sizes of both arrays. */
-  l1	= RARRAY(rb_o1)->len;
-  l2	= RARRAY(rb_o2)->len;
+  l1	= RARRAY_LEN(rb_o1);
+  l2	= RARRAY_LEN(rb_o2);
   /* Convert Ruby's threshold to C's threshold. */
@@ -24,13 +26,14 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
+  while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
+  while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
     l1--;
     l2--;
   }
@@ -55,12 +58,8 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -71,7 +70,9 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -81,25 +82,29 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
+      value1	= prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/ext/levenshtein/levenshtein_fast.c CHANGED

@@ -1,4 +1,5 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
@@ -15,7 +16,7 @@ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_t
 }
 void Init_levenshtein_fast() {
-  VALUE mLevenshtein	= rb_define_module("Levenshtein");
+  mLevenshtein	= rb_const_get(rb_mKernel, rb_intern("Levenshtein"));
-  rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
+  rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3);
 }

data/ext/levenshtein/levenshtein_generic.c CHANGED

@@ -1,16 +1,17 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
+  int	value1, value2;
-  ID id_length	= rb_intern("length");
-  ID id_get	= rb_intern("[]");
-  ID id_equal	= rb_intern("==");
+  ID	id_length	= rb_intern("length");
+  ID	id_get		= rb_intern("[]");
   /* Get the sizes of both sequences. */
@@ -28,13 +29,14 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
+  while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
     l1--;
     l2--;
   }
@@ -59,12 +61,8 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -75,7 +73,9 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -85,25 +85,29 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
+      value1	= prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/ext/levenshtein/levenshtein_string.c CHANGED

@@ -1,25 +1,27 @@
 #include "ruby.h"
+#include "levenshtein.h"
 VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
   int	threshold;
   int	l1, l2;
-  int	*prev_row, *curr_row;
+  int	*prev_row, *curr_row, *temp_row;
   int	col, row;
   int	curr_row_min, result;
   int	offset;
+  int	value1, value2;
   char	*s1, *s2;
   /* Convert Ruby's s1 to C's s1. */
   rb_o1	= StringValue(rb_o1);
-  s1	= RSTRING(rb_o1)->ptr;
-  l1	= RSTRING(rb_o1)->len;
+  s1	= RSTRING_PTR(rb_o1);
+  l1	= RSTRING_LEN(rb_o1);
   /* Convert Ruby's s2 to C's s2. */
   rb_o2	= StringValue(rb_o2);
-  s2	= RSTRING(rb_o2)->ptr;
-  l2	= RSTRING(rb_o2)->len;
+  s2	= RSTRING_PTR(rb_o2);
+  l2	= RSTRING_LEN(rb_o2);
   /* Convert Ruby's threshold to C's threshold. */
@@ -32,13 +34,14 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
   offset	= 0;
-  while (s1[offset] == s2[offset]) {
+  while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
     offset++;
   }
   /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
-  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
+  while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
     l1--;
     l2--;
   }
@@ -63,12 +66,8 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
   /* Allocate memory for both rows */
-  prev_row	= ALLOC_N(int, l1+1);
-  curr_row	= ALLOC_N(int, l1+1);
-  if ((prev_row == NULL) || (curr_row == NULL)) {
-    rb_raise(rb_eNoMemError, "out of memory");
-  }
+  prev_row	= (int*) ALLOC_N(int, (l1+1));
+  curr_row	= (int*) ALLOC_N(int, (l1+1));
   /* Initialize the current row. */
@@ -79,7 +78,9 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
   for (row=1; row<=l2; row++) {
     /* Copy the current row to the previous row. */
-    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
+    temp_row	= prev_row;
+    prev_row	= curr_row;
+    curr_row	= temp_row;
     /* Calculate the values of the current row. */
@@ -89,25 +90,29 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
     for (col=1; col<=l1; col++) {
       /* Equal (cost=0) or substitution (cost=1). */
-      curr_row[col]	= prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
+      value1	= prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
       /* Insertion if it's cheaper than substitution. */
-      if (prev_row[col]+1 < curr_row[col]) {
-        curr_row[col] = prev_row[col]+1;
+      value2	= prev_row[col]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Deletion if it's cheaper than substitution. */
-      if (curr_row[col-1]+1 < curr_row[col]) {
-        curr_row[col] = curr_row[col-1]+1;
+      value2	= curr_row[col-1]+1;
+      if (value2 < value1) {
+        value1	= value2;
       }
       /* Keep track of the minimum value on this row. */
-      if (curr_row[col] < curr_row_min) {
-        curr_row_min	= curr_row[col];
+      if (value1 < curr_row_min) {
+        curr_row_min	= value1;
       }
+      curr_row[col]	= value1;
     }
     /* Return nil as soon as we exceed the threshold. */

data/lib/levenshtein.rb CHANGED

@@ -1,44 +1,25 @@
-begin
-  require "levenshtein/levenshtein_fast"	# If compiled by RubyGems.
-rescue LoadError
-  begin
-    require "levenshtein_fast"			# If compiled by the build script.
-  rescue LoadError
-    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
-  end
-end
-# The Levenshtein distance is a metric for measuring the amount
-# of difference between two sequences (i.e., the so called edit
-# distance). The Levenshtein distance between two sequences is
-# given by the minimum number of operations needed to transform
-# one sequence into the other, where an operation is an
-# insertion, deletion, or substitution of a single element.
-#
-# More information about the Levenshtein distance algorithm:
-# http://en.wikipedia.org/wiki/Levenshtein_distance .
+require "levenshtein/exception"
+require "levenshtein/version"
 module Levenshtein
-  VERSION	= "0.2.0"
   # Returns the Levenshtein distance as a number between 0.0 and
   # 1.0. It's basically the Levenshtein distance divided by the
   # length of the longest sequence.
-  def self.normalized_distance(s1, s2, threshold=nil)
-    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
+  def self.normalized_distance(a1, a2, threshold=nil)
+    a1, a2	= a2, a1	if a1.length > a2.length	# a1 is the short one; a2 is the long one.
-    if s2.length == 0
-      0.0	# Since s1.length < s2.length, s1 must be empty as well.
+    if a2.length == 0
+      0.0	# Since a1.length < a2.length, a1 must be empty as well.
     else
       if threshold
-        if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
-          d.to_f/s2.length
+        if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
+          d.to_f/a2.length
         else
           nil
         end
       else
-        self.distance(s1, s2).to_f/s2.length
+        self.distance(a1, a2).to_f/a2.length
       end
     end
   end
@@ -53,47 +34,64 @@ module Levenshtein
   # The sequences should respond to :length and :[] and all objects
   # in the sequences (as returned by []) should response to :==.
-  def self.distance(s1, s2, threshold=nil)
-    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
+  def self.distance(a1, a2, threshold=nil)
+    a1, a2	= a2, a1	if a1.length > a2.length	# a1 is the short one; a2 is the long one.
     # Handle some basic circumstances.
-    return 0		if s1 == s2
-    return s2.length	if s1.length == 0
+    return 0		if a1 == a2
+    return a2.length	if a1.length == 0
     if threshold
-      return nil	if (s2.length-s1.length) >= threshold
+      return nil	if (a2.length-a1.length) >= threshold
-      a1, a2	= nil, nil
-      a1, a2	= s1, s2			if s1.respond_to?(:-) and s2.respond_to?(:-)
-      a1, a2	= s1.scan(/./), s2.scan(/./)	if s1.respond_to?(:scan) and s2.respond_to?(:scan)
+      a3, a4	= nil, nil
+      a3, a4	= a1, a2			if a1.respond_to?(:-) and a2.respond_to?(:-)
+      a3, a4	= a1.scan(/./), a2.scan(/./)	if a1.respond_to?(:scan) and a2.respond_to?(:scan)
-      if a1 and a2
-        return nil	if (a1-a2).length >= threshold
-        return nil	if (a2-a1).length >= threshold
+      if a3 and a4
+        return nil	if (a3-a4).length >= threshold
+        return nil	if (a4-a3).length >= threshold
       end
     end
-    distance_fast_or_slow(s1, s2, threshold)
+    distance_fast_or_slow(a1, a2, threshold)
   end
-  def self.distance_fast_or_slow(s1, s2, threshold)	# :nodoc:
-    if respond_to?(:levenshtein_distance_fast)
-      levenshtein_distance_fast(s1, s2, threshold)	# Implemented in C.
+  def self.distance_fast_or_slow(a1, a2, threshold)	# :nodoc:
+    if respond_to?(:distance_fast)
+      distance_fast(a1, a2, threshold)	# Implemented in C.
     else
-      levenshtein_distance_slow(s1, s2, threshold)	# Implemented in Ruby.
+      distance_slow(a1, a2, threshold)	# Implemented in Ruby.
     end
   end
-  def self.levenshtein_distance_slow(s1, s2, threshold)	# :nodoc:
-    row	= (0..s1.length).to_a
+  def self.distance_slow(a1, a2, threshold)	# :nodoc:
+    l1	= a1.length
+    l2	= a2.length
-    1.upto(s2.length) do |y|
-      prow	= row
-      row	= [y]
+    offset	= 0
+    while offset < l1 and offset < l2 and a1[offset] == a2[offset]
+      offset += 1
+    end
+    while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
+      l1 -= 1
+      l2 -= 1
+    end
+    l1 -= offset
+    l2 -= offset
-      1.upto(s1.length) do |x|
-        row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
+    crow	= (0..l1).to_a
+    1.upto(l2) do |y|
+      prow	= crow
+      crow	= [y]
+      1.upto(l1) do |x|
+        crow[x]	= [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
       end
       # Stop analysing this sequence as soon as the best possible
@@ -101,9 +99,19 @@ module Levenshtein
       # (The minimum value in the next row will be equal to or greater
       # than the minimum value in this row.)
-      return nil	if threshold and row.min >= threshold
+      return nil	if threshold and crow.min >= threshold
     end
-    row[-1]
+    crow[-1]
+  end
+end
+begin
+  require "levenshtein/levenshtein_fast"	# Compiled by RubyGems.
+rescue LoadError
+  begin
+    require "levenshtein_fast"			# Compiled by the build script.
+  rescue LoadError
+    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
   end
 end

data/lib/levenshtein/exception.rb ADDED

@@ -0,0 +1,4 @@
+module Levenshtein
+  class LevenshteinException < RuntimeError
+  end
+end

data/lib/levenshtein/version.rb ADDED

@@ -0,0 +1,3 @@
+module Levenshtein
+  VERSION	= "0.2.1"
+end

data/test/test.rb CHANGED

@@ -12,6 +12,8 @@ module Levenshtein
     end
     def [](pos)
+      raise "type not allowed [#{pos.inspect}]"	unless pos.kind_of?(Fixnum)
       @sequence[pos]
     end
   end
@@ -105,21 +107,31 @@ end
 class TestLevenshteinSlow < Test::Unit::TestCase
   def test_erik_veenstra
-    assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
+    assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
+    assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
   end
-  def test_empty_sequence
-    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
-    assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
+  def test_empty_string
+    assert_equal(0, Levenshtein.distance_slow("", "", nil))
+    assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
+    assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
   end
-  def test_same_sequence
-    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
-    assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
+  def test_same_string
+    assert_equal(0, Levenshtein.distance_slow("", "", nil))
+    assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
   end
   def test_threshold
-    assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
-    assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
+    assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
+    assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
+    assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
+  end
+  def test_same_head_and_or_tail
+    assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
+    assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
+    assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
+    assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
   end
 end

metadata CHANGED

@@ -1,7 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: levenshtein
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  hash: 21
+  prerelease:
+  segments:
+  - 0
+  - 2
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Erik Veenstra
@@ -9,8 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-07-11 00:00:00 +02:00
-default_executable:
+date: 2012-02-11 00:00:00 Z
 dependencies: []
 description: Calculates the Levenshtein distance between two byte strings.
@@ -22,18 +27,21 @@ extensions:
 extra_rdoc_files: []
 files:
+- lib/levenshtein/exception.rb
+- lib/levenshtein/version.rb
 - lib/levenshtein.rb
-- ext/levenshtein/extconf.rb
-- ext/levenshtein/levenshtein_array_of_strings.c
-- ext/levenshtein/levenshtein_fast.c
 - ext/levenshtein/levenshtein_string.c
 - ext/levenshtein/levenshtein_generic.c
+- ext/levenshtein/levenshtein.h
+- ext/levenshtein/levenshtein_fast.c
+- ext/levenshtein/levenshtein_array_of_strings.c
 - ext/levenshtein/levenshtein_array.c
+- ext/levenshtein/extconf.rb
 - README
 - LICENSE
 - VERSION
 - CHANGELOG
-has_rdoc: true
+- test/test.rb
 homepage: http://www.erikveen.dds.nl/levenshtein/index.html
 licenses: []
@@ -44,27 +52,33 @@ rdoc_options:
 - VERSION
 - CHANGELOG
 - --title
-- levenshtein (0.2.0)
+- levenshtein (0.2.1)
 - --main
 - README
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
 rubyforge_project: levenshtein
-rubygems_version: 1.3.4
+rubygems_version: 1.8.12
 signing_key:
 specification_version: 3
 summary: Calculates the Levenshtein distance between two byte strings.