RubyGems - levenshtein-19 - Versions diffs - 0.3.0 - Mend

levenshtein-19 0.3.0

Files changed (17) hide show

data/.gitignore +5 -0
data/CHANGELOG +26 -0
data/Gemfile +4 -0
data/LICENSE +15 -0
data/README +15 -0
data/Rakefile +1 -0
data/ext/levenshtein/extconf.rb +10 -0
data/ext/levenshtein/levenshtein_array.c +127 -0
data/ext/levenshtein/levenshtein_array_of_strings.c +125 -0
data/ext/levenshtein/levenshtein_fast.c +21 -0
data/ext/levenshtein/levenshtein_generic.c +129 -0
data/ext/levenshtein/levenshtein_string.c +133 -0
data/levenshtein-19.gemspec +21 -0
data/lib/levenshtein/version.rb +3 -0
data/lib/levenshtein.rb +109 -0
data/test/test.rb +125 -0
metadata +64 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,5 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*
+tmp/*

data/CHANGELOG ADDED Viewed

@@ -0,0 +1,26 @@
+0.3.0 (11-20-2011)
+* Use RARRAY_LEN, RSTRING_LEN, and RSTRING_PTR for 1.9 compatibility.
+0.2.0 (11-07-2009)
+* Return 0 instead of 0.0 in case of empty strings.
+* Added specific support for arrays.
+* Added specific support for arrays of strings.
+* Added generic support for all (?) kinds of sequences.
+* Moved a lot of code to the C world.
+0.1.1 (06-10-2008)
+* If one of the strings was both the begin and the end of the
+  other string, it would be stripped from both ends. Example:
+  Levenshtein.distance("abracadabra", "abra") resulted in 3
+  instead of 7. It's fixed now.
+0.1.0 (24-05-2008)
+* First release.

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
+source "https://rubygems.org"
+# Specify your gem's dependencies in levenshtein-19.gemspec
+gemspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,15 @@
+# Copyright Erik Veenstra <levenshtein@erikveen.dds.nl>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+# PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this program; if not, write to the Free
+# Software Foundation, Inc., 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA.

data/README ADDED Viewed

@@ -0,0 +1,15 @@
+The Levenshtein distance is a metric for measuring the amount of difference
+between two sequences (i.e., the so-called edit distance). The Levenshtein
+distance between two sequences is given by the minimum number of operations
+needed to transform one sequence into the other, where an operation is an
+insertion, deletion, or substitution of a single element.
+The two sequences can be two strings, two arrays, or two other objects.
+Strings, arrays and arrays of strings are handled with optimized (very fast) C
+code. All other sequences are handled with generic (fast) C code.
+More information about the Levenshtein distance algorithm:
+http://en.wikipedia.org/wiki/Levenshtein_distance .
+NOTE: This gem was written by Erik Veenstra. I have made slight modifications
+to it for compatibility with Ruby 1.9.

data/Rakefile ADDED Viewed

@@ -0,0 +1 @@
+require "bundler/gem_tasks"

data/ext/levenshtein/extconf.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require "mkmf"
+# Lets the user override include/lib directories with
+# --with-levenshtein-dir (and friends) on the command line.
+dir_config("levenshtein")
+# levenshtein_array.c, levenshtein_array_of_strings.c, levenshtein_generic.c
+# and levenshtein_string.c are source files of this extension, not external
+# libraries, so there is nothing to probe for with have_library: mkmf
+# compiles every C source in this directory into the one target below.
+create_makefile("levenshtein/levenshtein_fast")

data/ext/levenshtein/levenshtein_array.c ADDED Viewed

@@ -0,0 +1,127 @@
+#include "ruby.h"
+/* Compares rb_o1[i] with rb_o2[j] by calling the element's `==` method. */
+static int levenshtein_array_equal_at(VALUE rb_o1, long i, VALUE rb_o2, long j, ID id_eql) {
+  return RTEST(rb_funcall(rb_ary_entry(rb_o1, i), id_eql, 1, rb_ary_entry(rb_o2, j)));
+}
+/*
+ * Levenshtein distance between two Ruby arrays; elements are compared with
+ * their `==` method.
+ *
+ * self         - receiver (unused).
+ * rb_o1, rb_o2 - the arrays to compare.
+ * rb_threshold - nil for "no threshold", otherwise a number convertible
+ *                with NUM2INT (which raises for anything else). Negative
+ *                values behave like nil.
+ *
+ * Returns the distance as a Fixnum, or nil as soon as the smallest value
+ * in a finished row is greater than or equal to the threshold.
+ *
+ * NOTE: `==` runs arbitrary Ruby code. If it raises while the matrix rows
+ * are allocated, the two xfree() calls at the end are skipped.
+ */
+VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int threshold;
+  int l1, l2;
+  int *prev_row, *curr_row, *swap_row;
+  int col, row;
+  int curr_row_min;
+  int offset;
+  int exceeded = 0;
+  VALUE rb_result = Qnil;
+  ID id_eql = rb_intern("==");
+  /* Get the sizes of both arrays. */
+  l1 = RARRAY_LEN(rb_o1);
+  l2 = RARRAY_LEN(rb_o2);
+  /* Convert Ruby's threshold to C's threshold (-1 means "none"). */
+  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);
+  /* Skip the common prefix. Both indexes are bounded: rb_ary_entry returns */
+  /* nil past the end, so an unbounded loop would keep matching nil == nil. */
+  offset = 0;
+  while ((offset < l1) && (offset < l2) && levenshtein_array_equal_at(rb_o1, offset, rb_o2, offset, id_eql)) {
+    offset++;
+  }
+  /* Skip the common postfix, never reaching back into the stripped prefix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && levenshtein_array_equal_at(rb_o1, l1-1, rb_o2, l2-1, id_eql)) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate both rows. ALLOC_N raises NoMemoryError itself on failure, */
+  /* so no NULL check is needed; the memory must be released with xfree. */
+  prev_row = ALLOC_N(int, l1+1);
+  curr_row = ALLOC_N(int, l1+1);
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col] = col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* The finished row becomes the previous row; the old previous row is */
+    /* recycled and completely overwritten below. */
+    swap_row = prev_row;
+    prev_row = curr_row;
+    curr_row = swap_row;
+    /* Calculate the values of the current row. */
+    curr_row[0] = row;
+    curr_row_min = row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col] = prev_row[col-1] + (levenshtein_array_equal_at(rb_o1, offset+col-1, rb_o2, offset+row-1, id_eql) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min = curr_row[col];
+      }
+    }
+    /* Give up (result stays nil) as soon as we reach the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      exceeded = 1;
+      break;
+    }
+  }
+  /* The result is the last value on the last row. */
+  if (!exceeded) {
+    rb_result = INT2FIX(curr_row[l1]);
+  }
+  xfree(prev_row);
+  xfree(curr_row);
+  return rb_result;
+}

data/ext/levenshtein/levenshtein_array_of_strings.c ADDED Viewed

@@ -0,0 +1,125 @@
+#include "ruby.h"
+/*
+ * Element comparison for the string-optimized path. rb_str_cmp may only be
+ * handed real Strings, so anything else (nil, numbers, ...) falls back to
+ * the element's `==` via rb_equal instead of being read as a String.
+ */
+static int levenshtein_aos_equal(VALUE rb_e1, VALUE rb_e2) {
+  if ((TYPE(rb_e1) == T_STRING) && (TYPE(rb_e2) == T_STRING)) {
+    return rb_str_cmp(rb_e1, rb_e2) == 0;
+  }
+  return RTEST(rb_equal(rb_e1, rb_e2));
+}
+/*
+ * Levenshtein distance between two Ruby arrays whose elements are expected
+ * to be Strings.
+ *
+ * self         - receiver (unused).
+ * rb_o1, rb_o2 - the arrays to compare.
+ * rb_threshold - nil for "no threshold", otherwise a number convertible
+ *                with NUM2INT (which raises for anything else). Negative
+ *                values behave like nil.
+ *
+ * Returns the distance as a Fixnum, or nil as soon as the smallest value
+ * in a finished row is greater than or equal to the threshold.
+ *
+ * NOTE: for non-String elements `==` runs arbitrary Ruby code. If it raises
+ * while the matrix rows are allocated, the final xfree() calls are skipped.
+ */
+VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int threshold;
+  int l1, l2;
+  int *prev_row, *curr_row, *swap_row;
+  int col, row;
+  int curr_row_min;
+  int offset;
+  int exceeded = 0;
+  VALUE rb_result = Qnil;
+  /* Get the sizes of both arrays. */
+  l1 = RARRAY_LEN(rb_o1);
+  l2 = RARRAY_LEN(rb_o2);
+  /* Convert Ruby's threshold to C's threshold (-1 means "none"). */
+  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);
+  /* Skip the common prefix without ever indexing past either array. */
+  offset = 0;
+  while ((offset < l1) && (offset < l2) && levenshtein_aos_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset))) {
+    offset++;
+  }
+  /* Skip the common postfix, never reaching back into the stripped prefix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && levenshtein_aos_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1))) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate both rows. ALLOC_N raises NoMemoryError itself on failure, */
+  /* so no NULL check is needed; the memory must be released with xfree. */
+  prev_row = ALLOC_N(int, l1+1);
+  curr_row = ALLOC_N(int, l1+1);
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col] = col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* The finished row becomes the previous row; the old previous row is */
+    /* recycled and completely overwritten below. */
+    swap_row = prev_row;
+    prev_row = curr_row;
+    curr_row = swap_row;
+    /* Calculate the values of the current row. */
+    curr_row[0] = row;
+    curr_row_min = row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col] = prev_row[col-1] + (levenshtein_aos_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min = curr_row[col];
+      }
+    }
+    /* Give up (result stays nil) as soon as we reach the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      exceeded = 1;
+      break;
+    }
+  }
+  /* The result is the last value on the last row. */
+  if (!exceeded) {
+    rb_result = INT2FIX(curr_row[l1]);
+  }
+  xfree(prev_row);
+  xfree(curr_row);
+  return rb_result;
+}

data/ext/levenshtein/levenshtein_fast.c ADDED Viewed

@@ -0,0 +1,21 @@
+#include "ruby.h"
+/*
+ * Prototypes of the specialised implementations called below. Without
+ * them the compiler assumes an `int` return type, which truncates VALUE
+ * on platforms where VALUE is wider than int.
+ */
+VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
+VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
+VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
+VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
+/* Returns 1 when rb_ary is non-empty and every element is a String. */
+static int levenshtein_fast_all_strings(VALUE rb_ary) {
+  long i;
+  long len = RARRAY_LEN(rb_ary);
+  if (len == 0) {
+    return 0;
+  }
+  for (i=0; i<len; i++) {
+    if (TYPE(rb_ary_entry(rb_ary, i)) != T_STRING) {
+      return 0;
+    }
+  }
+  return 1;
+}
+/*
+ * Dispatches to the implementation that matches the argument types:
+ * two Strings, two Arrays containing only Strings, two other Arrays, or
+ * anything else (generic path).
+ *
+ * The String-only array path is taken only when every element of both
+ * arrays is a String, not just the first one, so that path never receives
+ * a non-String element.
+ *
+ * Returns whatever the selected implementation returns.
+ */
+VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2) == T_STRING)) {
+    return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
+  } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2) == T_ARRAY)) {
+    if (levenshtein_fast_all_strings(rb_o1) && levenshtein_fast_all_strings(rb_o2)) {
+      return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
+    } else {
+      return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
+    }
+  } else {
+    return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
+  }
+}
+/*
+ * Extension entry point: defines (or reopens) the Levenshtein module and
+ * registers levenshtein_distance_fast on it as a singleton method taking
+ * three arguments.
+ */
+void Init_levenshtein_fast(void) {
+  VALUE levenshtein_module;
+  levenshtein_module = rb_define_module("Levenshtein");
+  rb_define_singleton_method(levenshtein_module, "levenshtein_distance_fast", levenshtein_distance_fast, 3);
+}

data/ext/levenshtein/levenshtein_generic.c ADDED Viewed

@@ -0,0 +1,129 @@
+#include "ruby.h"
+/* Compares rb_o1[i] with rb_o2[j] using the sequences' `[]` and the element's `==`. */
+static int levenshtein_generic_equal_at(VALUE rb_o1, int i, VALUE rb_o2, int j, ID id_get, ID id_equal) {
+  VALUE rb_e1 = rb_funcall(rb_o1, id_get, 1, INT2FIX(i));
+  VALUE rb_e2 = rb_funcall(rb_o2, id_get, 1, INT2FIX(j));
+  return RTEST(rb_funcall(rb_e1, id_equal, 1, rb_e2));
+}
+/*
+ * Levenshtein distance between two arbitrary sequences. Both objects are
+ * only accessed through `length` and `[]`; elements are compared with `==`.
+ *
+ * self         - receiver (unused).
+ * rb_o1, rb_o2 - the sequences to compare.
+ * rb_threshold - nil for "no threshold", otherwise a number convertible
+ *                with NUM2INT (which raises for anything else). Negative
+ *                values behave like nil.
+ *
+ * Returns the distance as a Fixnum, or nil as soon as the smallest value
+ * in a finished row is greater than or equal to the threshold.
+ *
+ * NOTE: `[]` and `==` run arbitrary Ruby code. If either raises while the
+ * matrix rows are allocated, the final xfree() calls are skipped.
+ */
+VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int threshold;
+  int l1, l2;
+  int *prev_row, *curr_row, *swap_row;
+  int col, row;
+  int curr_row_min;
+  int offset;
+  int exceeded = 0;
+  VALUE rb_result = Qnil;
+  ID id_length = rb_intern("length");
+  ID id_get = rb_intern("[]");
+  ID id_equal = rb_intern("==");
+  /* Get the sizes of both sequences (NUM2INT rejects non-numeric lengths). */
+  l1 = NUM2INT(rb_funcall(rb_o1, id_length, 0));
+  l2 = NUM2INT(rb_funcall(rb_o2, id_length, 0));
+  /* Convert Ruby's threshold to C's threshold (-1 means "none"). */
+  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);
+  /* Skip the common prefix. The indexes are bounded so `[]` is never */
+  /* asked for a position at or beyond the reported length. */
+  offset = 0;
+  while ((offset < l1) && (offset < l2) && levenshtein_generic_equal_at(rb_o1, offset, rb_o2, offset, id_get, id_equal)) {
+    offset++;
+  }
+  /* Skip the common postfix, never reaching back into the stripped prefix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && levenshtein_generic_equal_at(rb_o1, l1-1, rb_o2, l2-1, id_get, id_equal)) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate both rows. ALLOC_N raises NoMemoryError itself on failure, */
+  /* so no NULL check is needed; the memory must be released with xfree. */
+  prev_row = ALLOC_N(int, l1+1);
+  curr_row = ALLOC_N(int, l1+1);
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col] = col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* The finished row becomes the previous row; the old previous row is */
+    /* recycled and completely overwritten below. */
+    swap_row = prev_row;
+    prev_row = curr_row;
+    curr_row = swap_row;
+    /* Calculate the values of the current row. */
+    curr_row[0] = row;
+    curr_row_min = row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col] = prev_row[col-1] + (levenshtein_generic_equal_at(rb_o1, offset+col-1, rb_o2, offset+row-1, id_get, id_equal) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min = curr_row[col];
+      }
+    }
+    /* Give up (result stays nil) as soon as we reach the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      exceeded = 1;
+      break;
+    }
+  }
+  /* The result is the last value on the last row. */
+  if (!exceeded) {
+    rb_result = INT2FIX(curr_row[l1]);
+  }
+  xfree(prev_row);
+  xfree(curr_row);
+  return rb_result;
+}

data/ext/levenshtein/levenshtein_string.c ADDED Viewed

@@ -0,0 +1,133 @@
+#include "ruby.h"
+/*
+ * Levenshtein distance between two Ruby Strings, compared byte by byte.
+ *
+ * self         - receiver (unused).
+ * rb_o1, rb_o2 - the strings to compare (converted with StringValue).
+ * rb_threshold - nil for "no threshold", otherwise a number convertible
+ *                with NUM2INT (which raises for anything else). Negative
+ *                values behave like nil.
+ *
+ * Returns the distance as a Fixnum, or nil as soon as the smallest value
+ * in a finished row is greater than or equal to the threshold.
+ */
+VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
+  int threshold;
+  int l1, l2;
+  int *prev_row, *curr_row, *swap_row;
+  int col, row;
+  int curr_row_min;
+  int offset;
+  int exceeded = 0;
+  char *s1, *s2;
+  VALUE rb_result = Qnil;
+  /* Convert both arguments first, then take the raw pointers, so no */
+  /* conversion runs between fetching a pointer and using it. */
+  StringValue(rb_o1);
+  StringValue(rb_o2);
+  s1 = RSTRING_PTR(rb_o1);
+  l1 = RSTRING_LEN(rb_o1);
+  s2 = RSTRING_PTR(rb_o2);
+  l2 = RSTRING_LEN(rb_o2);
+  /* Convert Ruby's threshold to C's threshold (-1 means "none"). */
+  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);
+  /* Skip the common prefix. The explicit bounds matter: Ruby strings may */
+  /* contain NUL bytes, so the terminator alone does not stop the scan. */
+  offset = 0;
+  while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
+    offset++;
+  }
+  /* Skip the common postfix, never reaching back into the stripped prefix. */
+  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
+    l1--;
+    l2--;
+  }
+  l1 -= offset;
+  l2 -= offset;
+  /* The Levenshtein algorithm itself. */
+  /*       s1=              */
+  /*       ERIK             */
+  /*                        */
+  /*      01234             */
+  /* s2=V 11234             */
+  /*    E 21234             */
+  /*    E 32234             */
+  /*    N 43334 <- prev_row */
+  /*    S 54444 <- curr_row */
+  /*    T 65555             */
+  /*    R 76566             */
+  /*    A 87667             */
+  /* Allocate both rows. ALLOC_N raises NoMemoryError itself on failure, */
+  /* so no NULL check is needed; the memory must be released with xfree. */
+  prev_row = ALLOC_N(int, l1+1);
+  curr_row = ALLOC_N(int, l1+1);
+  /* Initialize the current row. */
+  for (col=0; col<=l1; col++) {
+    curr_row[col] = col;
+  }
+  for (row=1; row<=l2; row++) {
+    /* The finished row becomes the previous row; the old previous row is */
+    /* recycled and completely overwritten below. */
+    swap_row = prev_row;
+    prev_row = curr_row;
+    curr_row = swap_row;
+    /* Calculate the values of the current row. */
+    curr_row[0] = row;
+    curr_row_min = row;
+    for (col=1; col<=l1; col++) {
+      /* Equal (cost=0) or substitution (cost=1). */
+      curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
+      /* Insertion if it's cheaper than substitution. */
+      if (prev_row[col]+1 < curr_row[col]) {
+        curr_row[col] = prev_row[col]+1;
+      }
+      /* Deletion if it's cheaper than substitution. */
+      if (curr_row[col-1]+1 < curr_row[col]) {
+        curr_row[col] = curr_row[col-1]+1;
+      }
+      /* Keep track of the minimum value on this row. */
+      if (curr_row[col] < curr_row_min) {
+        curr_row_min = curr_row[col];
+      }
+    }
+    /* Give up (result stays nil) as soon as we reach the threshold. */
+    if (threshold > -1 && curr_row_min >= threshold) {
+      exceeded = 1;
+      break;
+    }
+  }
+  /* The result is the last value on the last row. */
+  if (!exceeded) {
+    rb_result = INT2FIX(curr_row[l1]);
+  }
+  xfree(prev_row);
+  xfree(curr_row);
+  return rb_result;
+}

data/levenshtein-19.gemspec ADDED Viewed

@@ -0,0 +1,21 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "levenshtein/version"
+Gem::Specification.new do |s|
+  s.name        = "levenshtein-19"
+  s.version     = Levenshtein::VERSION
+  s.authors     = ["Erik Veenstra", "Ryan Fitzgerald"]
+  s.email       = ["rwfitzge@gmail.com"]
+  s.homepage    = "http://github.com/rwfitzge/levenshtein-19"
+  s.summary     = %q{Calculates the Levenshtein distance between two byte strings.}
+  s.description = %q{Calculates the Levenshtein distance between two byte strings.}
+  s.rubyforge_project = "levenshtein-19"
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.extensions    = ["ext/levenshtein/extconf.rb"]
+end

data/lib/levenshtein/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Levenshtein
+  VERSION = "0.3.0"
+end

data/lib/levenshtein.rb ADDED Viewed

@@ -0,0 +1,109 @@
+require "levenshtein/version"
+begin
+  require "levenshtein/levenshtein_fast"	# If compiled by RubyGems.
+rescue LoadError
+  begin
+    require "levenshtein_fast"			# If compiled by the build script.
+  rescue LoadError
+    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
+  end
+end
+# The Levenshtein distance is a metric for measuring the amount
+# of difference between two sequences (i.e., the so called edit
+# distance). The Levenshtein distance between two sequences is
+# given by the minimum number of operations needed to transform
+# one sequence into the other, where an operation is an
+# insertion, deletion, or substitution of a single element.
+#
+# More information about the Levenshtein distance algorithm:
+# http://en.wikipedia.org/wiki/Levenshtein_distance .
+module Levenshtein
+  # Returns the Levenshtein distance as a number between 0.0 and
+  # 1.0. It's basically the Levenshtein distance divided by the
+  # length of the longest sequence.
+  def self.normalized_distance(s1, s2, threshold=nil)
+    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
+    if s2.length == 0
+      0.0	# Since s1.length < s2.length, s1 must be empty as well.
+    else
+      if threshold
+        if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
+          d.to_f/s2.length
+        else
+          nil
+        end
+      else
+        self.distance(s1, s2).to_f/s2.length
+      end
+    end
+  end
+  # Returns the Levenshtein distance between two sequences.
+  #
+  # The two sequences can be two strings, two arrays, or two other
+  # objects. Strings, arrays and arrays of strings are handled with
+  # optimized (very fast) C code. All other sequences are handled
+  # with generic (fast) C code.
+  #
+  # The sequences should respond to :length and :[] and all objects
+  # in the sequences (as returned by []) should response to :==.
+  def self.distance(s1, s2, threshold=nil)
+    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
+    # Handle some basic circumstances.
+    return 0		if s1 == s2
+    return s2.length	if s1.length == 0
+    if threshold
+      return nil	if (s2.length-s1.length) >= threshold
+      a1, a2	= nil, nil
+      a1, a2	= s1, s2			if s1.respond_to?(:-) and s2.respond_to?(:-)
+      a1, a2	= s1.scan(/./), s2.scan(/./)	if s1.respond_to?(:scan) and s2.respond_to?(:scan)
+      if a1 and a2
+        return nil	if (a1-a2).length >= threshold
+        return nil	if (a2-a1).length >= threshold
+      end
+    end
+    distance_fast_or_slow(s1, s2, threshold)
+  end
+  def self.distance_fast_or_slow(s1, s2, threshold)	# :nodoc:
+    if respond_to?(:levenshtein_distance_fast)
+      levenshtein_distance_fast(s1, s2, threshold)	# Implemented in C.
+    else
+      levenshtein_distance_slow(s1, s2, threshold)	# Implemented in Ruby.
+    end
+  end
+  def self.levenshtein_distance_slow(s1, s2, threshold)	# :nodoc:
+    row	= (0..s1.length).to_a
+    1.upto(s2.length) do |y|
+      prow	= row
+      row	= [y]
+      1.upto(s1.length) do |x|
+        row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
+      end
+      # Stop analysing this sequence as soon as the best possible
+      # result for this sequence is bigger than the best result so far.
+      # (The minimum value in the next row will be equal to or greater
+      # than the minimum value in this row.)
+      return nil	if threshold and row.min >= threshold
+    end
+    row[-1]
+  end
+end

data/test/test.rb ADDED Viewed

@@ -0,0 +1,125 @@
+require "test/unit"
+require "levenshtein"
+module Levenshtein
+  class TestSequence
+    def initialize(o)
+      @sequence	= o
+    end
+    def length
+      @sequence.length
+    end
+    def [](pos)
+      @sequence[pos]
+    end
+  end
+  class TestElement
+    attr_reader :object
+    def initialize(o)
+      @object	= o
+    end
+    def ==(other)
+      @object == other.object
+    end
+  end
+end
+class TestLevenshteinString < Test::Unit::TestCase
+  def test_erik_veenstra
+    assert_equal(7, Levenshtein.distance("erik", "veenstra"))
+    assert_equal(7, Levenshtein.distance("veenstra", "erik"))
+    assert_in_delta(0.875, Levenshtein.normalized_distance("erik", "veenstra"), 0.01)
+    assert_in_delta(0.875, Levenshtein.normalized_distance("veenstra", "erik"), 0.01)
+  end
+  def test_empty_string
+    assert_equal(0, Levenshtein.distance("", ""))
+    assert_equal(3, Levenshtein.distance("", "foo"))
+    assert_equal(3, Levenshtein.distance("foo", ""))
+    assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
+    assert_in_delta(1.0, Levenshtein.normalized_distance("", "foo"), 0.01)
+    assert_in_delta(1.0, Levenshtein.normalized_distance("foo", ""), 0.01)
+  end
+  def test_same_string
+    assert_equal(0, Levenshtein.distance("", ""))
+    assert_equal(0, Levenshtein.distance("foo", "foo"))
+    assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
+    assert_in_delta(0.0, Levenshtein.normalized_distance("foo", "foo"), 0.01)
+  end
+  def test_threshold
+    assert_equal(3, Levenshtein.distance("foo", "foobar"))
+    assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
+    assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
+    assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
+    assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
+    assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
+  end
+  def test_same_head_and_or_tail
+    assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd"))
+    assert_equal(3, Levenshtein.distance("ab123", "abxyz"))
+    assert_equal(3, Levenshtein.distance("123cd", "xyzcd"))
+    assert_equal(5, Levenshtein.distance("123cd123", "123"))
+    assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
+    assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
+    assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
+    assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
+  end
+end
+class TestLevenshteinArray < Test::Unit::TestCase
+  def test_erik_veenstra
+    x	= lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
+    assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
+  end
+end
+class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
+  def test_erik_veenstra
+    x	= lambda{|s| s.scan(/./)}
+    assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
+  end
+end
+class TestLevenshteinGeneric < Test::Unit::TestCase
+  def test_erik_veenstra
+    x	= lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
+    assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
+  end
+end
+class TestLevenshteinSlow < Test::Unit::TestCase
+  def test_erik_veenstra
+    assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
+  end
+  def test_empty_sequence
+    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
+    assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
+  end
+  def test_same_sequence
+    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
+    assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
+  end
+  def test_threshold
+    assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
+    assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,64 @@
+--- !ruby/object:Gem::Specification
+name: levenshtein-19
+version: !ruby/object:Gem::Version
+  version: 0.3.0
+  prerelease:
+platform: ruby
+authors:
+- Erik Veenstra
+- Ryan Fitzgerald
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-11-21 00:00:00.000000000Z
+dependencies: []
+description: Calculates the Levenshtein distance between two byte strings.
+email:
+- rwfitzge@gmail.com
+executables: []
+extensions:
+- ext/levenshtein/extconf.rb
+extra_rdoc_files: []
+files:
+- .gitignore
+- CHANGELOG
+- Gemfile
+- LICENSE
+- README
+- Rakefile
+- ext/levenshtein/extconf.rb
+- ext/levenshtein/levenshtein_array.c
+- ext/levenshtein/levenshtein_array_of_strings.c
+- ext/levenshtein/levenshtein_fast.c
+- ext/levenshtein/levenshtein_generic.c
+- ext/levenshtein/levenshtein_string.c
+- levenshtein-19.gemspec
+- lib/levenshtein.rb
+- lib/levenshtein/version.rb
+- test/test.rb
+homepage: http://github.com/rwfitzge/levenshtein-19
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: levenshtein-19
+rubygems_version: 1.8.11
+signing_key:
+specification_version: 3
+summary: Calculates the Levenshtein distance between two byte strings.
+test_files:
+- test/test.rb