levenshtein 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 0.2.0 (11-07-2009)
2
+
3
+ * Return 0 instead of 0.0 in case of empty strings.
4
+
5
+ * Added specific support for arrays.
6
+
7
+ * Added specific support for arrays of strings.
8
+
9
+ * Added generic support for all (?) kinds of sequences.
10
+
11
+ * Moved a lot of code to the C world.
12
+
1
13
  0.1.1 (06-10-2008)
2
14
 
3
15
  * If one of the strings was both the begin and the end of the
data/README CHANGED
@@ -1,8 +1,12 @@
1
1
  The Levenshtein distance is a metric for measuring the amount of difference
2
2
  between two sequences (i.e., the so called edit distance). The Levenshtein
3
- distance between two strings is given by the minimum number of operations
4
- needed to transform one string into the other, where an operation is an
5
- insertion, deletion, or substitution of a single character.
3
+ distance between two sequences is given by the minimum number of operations
4
+ needed to transform one sequence into the other, where an operation is an
5
+ insertion, deletion, or substitution of a single element.
6
+
7
+ The two sequences can be two strings, two arrays, or two other objects.
8
+ Strings, arrays and arrays of strings are handled with optimized (very fast) C
9
+ code. All other sequences are handled with generic (fast) C code.
6
10
 
7
11
  More information about the Levenshtein distance algorithm:
8
12
  http://en.wikipedia.org/wiki/Levenshtein_distance .
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.0
@@ -2,4 +2,9 @@ require "mkmf"
2
2
 
3
3
  dir_config("levenshtein")
4
4
 
5
- create_makefile("levenshtein/levenshtein_c")
5
+ have_library("levenshtein_array")
6
+ have_library("levenshtein_array_of_strings")
7
+ have_library("levenshtein_generic")
8
+ have_library("levenshtein_string")
9
+
10
+ create_makefile("levenshtein/levenshtein_fast")
@@ -0,0 +1,127 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_eql = rb_intern("==");
12
+
13
+ /* Get the sizes of both arrays. */
14
+
15
+ l1 = RARRAY(rb_o1)->len;
16
+ l2 = RARRAY(rb_o2)->len;
17
+
18
+ /* Convert Ruby's threshold to C's threshold. */
19
+
20
+ if (!NIL_P(rb_threshold)) {
21
+ threshold = FIX2INT(rb_threshold);
22
+ } else {
23
+ threshold = -1;
24
+ }
25
+
26
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
+
28
+ offset = 0;
29
+ while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
30
+ offset++;
31
+ }
32
+
33
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
34
+
35
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
36
+ l1--;
37
+ l2--;
38
+ }
39
+
40
+ l1 -= offset;
41
+ l2 -= offset;
42
+
43
+ /* The Levenshtein algorithm itself. */
44
+
45
+ /* s1= */
46
+ /* ERIK */
47
+ /* */
48
+ /* 01234 */
49
+ /* s2=V 11234 */
50
+ /* E 21234 */
51
+ /* E 32234 */
52
+ /* N 43334 <- prev_row */
53
+ /* S 54444 <- curr_row */
54
+ /* T 65555 */
55
+ /* R 76566 */
56
+ /* A 87667 */
57
+
58
+ /* Allocate memory for both rows */
59
+
60
+ prev_row = ALLOC_N(int, l1+1);
61
+ curr_row = ALLOC_N(int, l1+1);
62
+
63
+ if ((prev_row == NULL) || (curr_row == NULL)) {
64
+ rb_raise(rb_eNoMemError, "out of memory");
65
+ }
66
+
67
+ /* Initialize the current row. */
68
+
69
+ for (col=0; col<=l1; col++) {
70
+ curr_row[col] = col;
71
+ }
72
+
73
+ for (row=1; row<=l2; row++) {
74
+ /* Copy the current row to the previous row. */
75
+
76
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
77
+
78
+ /* Calculate the values of the current row. */
79
+
80
+ curr_row[0] = row;
81
+ curr_row_min = row;
82
+
83
+ for (col=1; col<=l1; col++) {
84
+ /* Equal (cost=0) or substitution (cost=1). */
85
+
86
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
87
+
88
+ /* Insertion if it's cheaper than substitution. */
89
+
90
+ if (prev_row[col]+1 < curr_row[col]) {
91
+ curr_row[col] = prev_row[col]+1;
92
+ }
93
+
94
+ /* Deletion if it's cheaper than substitution. */
95
+
96
+ if (curr_row[col-1]+1 < curr_row[col]) {
97
+ curr_row[col] = curr_row[col-1]+1;
98
+ }
99
+
100
+ /* Keep track of the minimum value on this row. */
101
+
102
+ if (curr_row[col] < curr_row_min) {
103
+ curr_row_min = curr_row[col];
104
+ }
105
+ }
106
+
107
+ /* Return nil as soon as we exceed the threshold. */
108
+
109
+ if (threshold > -1 && curr_row_min >= threshold) {
110
+ free(prev_row);
111
+ free(curr_row);
112
+
113
+ return Qnil;
114
+ }
115
+ }
116
+
117
+ /* The result is the last value on the last row. */
118
+
119
+ result = curr_row[l1];
120
+
121
+ free(prev_row);
122
+ free(curr_row);
123
+
124
+ /* Return the Ruby version of the result. */
125
+
126
+ return INT2FIX(result);
127
+ }
@@ -0,0 +1,125 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ /* Get the sizes of both arrays. */
12
+
13
+ l1 = RARRAY(rb_o1)->len;
14
+ l2 = RARRAY(rb_o2)->len;
15
+
16
+ /* Convert Ruby's threshold to C's threshold. */
17
+
18
+ if (!NIL_P(rb_threshold)) {
19
+ threshold = FIX2INT(rb_threshold);
20
+ } else {
21
+ threshold = -1;
22
+ }
23
+
24
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
25
+
26
+ offset = 0;
27
+ while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
28
+ offset++;
29
+ }
30
+
31
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
32
+
33
+ while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
34
+ l1--;
35
+ l2--;
36
+ }
37
+
38
+ l1 -= offset;
39
+ l2 -= offset;
40
+
41
+ /* The Levenshtein algorithm itself. */
42
+
43
+ /* s1= */
44
+ /* ERIK */
45
+ /* */
46
+ /* 01234 */
47
+ /* s2=V 11234 */
48
+ /* E 21234 */
49
+ /* E 32234 */
50
+ /* N 43334 <- prev_row */
51
+ /* S 54444 <- curr_row */
52
+ /* T 65555 */
53
+ /* R 76566 */
54
+ /* A 87667 */
55
+
56
+ /* Allocate memory for both rows */
57
+
58
+ prev_row = ALLOC_N(int, l1+1);
59
+ curr_row = ALLOC_N(int, l1+1);
60
+
61
+ if ((prev_row == NULL) || (curr_row == NULL)) {
62
+ rb_raise(rb_eNoMemError, "out of memory");
63
+ }
64
+
65
+ /* Initialize the current row. */
66
+
67
+ for (col=0; col<=l1; col++) {
68
+ curr_row[col] = col;
69
+ }
70
+
71
+ for (row=1; row<=l2; row++) {
72
+ /* Copy the current row to the previous row. */
73
+
74
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
75
+
76
+ /* Calculate the values of the current row. */
77
+
78
+ curr_row[0] = row;
79
+ curr_row_min = row;
80
+
81
+ for (col=1; col<=l1; col++) {
82
+ /* Equal (cost=0) or substitution (cost=1). */
83
+
84
+ curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
+
86
+ /* Insertion if it's cheaper than substitution. */
87
+
88
+ if (prev_row[col]+1 < curr_row[col]) {
89
+ curr_row[col] = prev_row[col]+1;
90
+ }
91
+
92
+ /* Deletion if it's cheaper than substitution. */
93
+
94
+ if (curr_row[col-1]+1 < curr_row[col]) {
95
+ curr_row[col] = curr_row[col-1]+1;
96
+ }
97
+
98
+ /* Keep track of the minimum value on this row. */
99
+
100
+ if (curr_row[col] < curr_row_min) {
101
+ curr_row_min = curr_row[col];
102
+ }
103
+ }
104
+
105
+ /* Return nil as soon as we exceed the threshold. */
106
+
107
+ if (threshold > -1 && curr_row_min >= threshold) {
108
+ free(prev_row);
109
+ free(curr_row);
110
+
111
+ return Qnil;
112
+ }
113
+ }
114
+
115
+ /* The result is the last value on the last row. */
116
+
117
+ result = curr_row[l1];
118
+
119
+ free(prev_row);
120
+ free(curr_row);
121
+
122
+ /* Return the Ruby version of the result. */
123
+
124
+ return INT2FIX(result);
125
+ }
@@ -0,0 +1,21 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
5
+ return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
6
+ } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
7
+ if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
8
+ return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
9
+ } else {
10
+ return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
11
+ }
12
+ } else {
13
+ return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
14
+ }
15
+ }
16
+
17
+ void Init_levenshtein_fast() {
18
+ VALUE mLevenshtein = rb_define_module("Levenshtein");
19
+
20
+ rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
21
+ }
@@ -0,0 +1,129 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_length = rb_intern("length");
12
+ ID id_get = rb_intern("[]");
13
+ ID id_equal = rb_intern("==");
14
+
15
+ /* Get the sizes of both sequences. */
16
+
17
+ l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
18
+ l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
19
+
20
+ /* Convert Ruby's threshold to C's threshold. */
21
+
22
+ if (!NIL_P(rb_threshold)) {
23
+ threshold = FIX2INT(rb_threshold);
24
+ } else {
25
+ threshold = -1;
26
+ }
27
+
28
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
29
+
30
+ offset = 0;
31
+ while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
32
+ offset++;
33
+ }
34
+
35
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
36
+
37
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
38
+ l1--;
39
+ l2--;
40
+ }
41
+
42
+ l1 -= offset;
43
+ l2 -= offset;
44
+
45
+ /* The Levenshtein algorithm itself. */
46
+
47
+ /* s1= */
48
+ /* ERIK */
49
+ /* */
50
+ /* 01234 */
51
+ /* s2=V 11234 */
52
+ /* E 21234 */
53
+ /* E 32234 */
54
+ /* N 43334 <- prev_row */
55
+ /* S 54444 <- curr_row */
56
+ /* T 65555 */
57
+ /* R 76566 */
58
+ /* A 87667 */
59
+
60
+ /* Allocate memory for both rows */
61
+
62
+ prev_row = ALLOC_N(int, l1+1);
63
+ curr_row = ALLOC_N(int, l1+1);
64
+
65
+ if ((prev_row == NULL) || (curr_row == NULL)) {
66
+ rb_raise(rb_eNoMemError, "out of memory");
67
+ }
68
+
69
+ /* Initialize the current row. */
70
+
71
+ for (col=0; col<=l1; col++) {
72
+ curr_row[col] = col;
73
+ }
74
+
75
+ for (row=1; row<=l2; row++) {
76
+ /* Copy the current row to the previous row. */
77
+
78
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
79
+
80
+ /* Calculate the values of the current row. */
81
+
82
+ curr_row[0] = row;
83
+ curr_row_min = row;
84
+
85
+ for (col=1; col<=l1; col++) {
86
+ /* Equal (cost=0) or substitution (cost=1). */
87
+
88
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
+
90
+ /* Insertion if it's cheaper than substitution. */
91
+
92
+ if (prev_row[col]+1 < curr_row[col]) {
93
+ curr_row[col] = prev_row[col]+1;
94
+ }
95
+
96
+ /* Deletion if it's cheaper than substitution. */
97
+
98
+ if (curr_row[col-1]+1 < curr_row[col]) {
99
+ curr_row[col] = curr_row[col-1]+1;
100
+ }
101
+
102
+ /* Keep track of the minimum value on this row. */
103
+
104
+ if (curr_row[col] < curr_row_min) {
105
+ curr_row_min = curr_row[col];
106
+ }
107
+ }
108
+
109
+ /* Return nil as soon as we exceed the threshold. */
110
+
111
+ if (threshold > -1 && curr_row_min >= threshold) {
112
+ free(prev_row);
113
+ free(curr_row);
114
+
115
+ return Qnil;
116
+ }
117
+ }
118
+
119
+ /* The result is the last value on the last row. */
120
+
121
+ result = curr_row[l1];
122
+
123
+ free(prev_row);
124
+ free(curr_row);
125
+
126
+ /* Return the Ruby version of the result. */
127
+
128
+ return INT2FIX(result);
129
+ }
@@ -1,25 +1,25 @@
1
1
  #include "ruby.h"
2
2
 
3
- static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
4
- VALUE rb_s3;
3
+ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
4
  int threshold;
6
- int l1, l2, l3;
7
- char *s1, *s2, *s3;
5
+ int l1, l2;
8
6
  int *prev_row, *curr_row;
9
7
  int col, row;
10
8
  int curr_row_min, result;
9
+ int offset;
10
+ char *s1, *s2;
11
11
 
12
12
  /* Convert Ruby's s1 to C's s1. */
13
13
 
14
- rb_s1 = StringValue(rb_s1);
15
- s1 = RSTRING(rb_s1)->ptr;
16
- l1 = RSTRING(rb_s1)->len;
14
+ rb_o1 = StringValue(rb_o1);
15
+ s1 = RSTRING(rb_o1)->ptr;
16
+ l1 = RSTRING(rb_o1)->len;
17
17
 
18
18
  /* Convert Ruby's s2 to C's s2. */
19
19
 
20
- rb_s2 = StringValue(rb_s2);
21
- s2 = RSTRING(rb_s2)->ptr;
22
- l2 = RSTRING(rb_s2)->len;
20
+ rb_o2 = StringValue(rb_o2);
21
+ s2 = RSTRING(rb_o2)->ptr;
22
+ l2 = RSTRING(rb_o2)->len;
23
23
 
24
24
  /* Convert Ruby's threshold to C's threshold. */
25
25
 
@@ -29,7 +29,24 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
29
29
  threshold = -1;
30
30
  }
31
31
 
32
- /* The Levenshtein Algorithm itself. */
32
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
33
+
34
+ offset = 0;
35
+ while (s1[offset] == s2[offset]) {
36
+ offset++;
37
+ }
38
+
39
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
40
+
41
+ while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
42
+ l1--;
43
+ l2--;
44
+ }
45
+
46
+ l1 -= offset;
47
+ l2 -= offset;
48
+
49
+ /* The Levenshtein algorithm itself. */
33
50
 
34
51
  /* s1= */
35
52
  /* ERIK */
@@ -43,7 +60,7 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
43
60
  /* T 65555 */
44
61
  /* R 76566 */
45
62
  /* A 87667 */
46
-
63
+
47
64
  /* Allocate memory for both rows */
48
65
 
49
66
  prev_row = ALLOC_N(int, l1+1);
@@ -70,9 +87,9 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
70
87
  curr_row_min = row;
71
88
 
72
89
  for (col=1; col<=l1; col++) {
73
- /* Equal (cost=0) or Substitution (cost=1). */
90
+ /* Equal (cost=0) or substitution (cost=1). */
74
91
 
75
- curr_row[col] = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);
92
+ curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
76
93
 
77
94
  /* Insertion if it's cheaper than substitution. */
78
95
 
@@ -114,9 +131,3 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
114
131
 
115
132
  return INT2FIX(result);
116
133
  }
117
-
118
- void Init_levenshtein_c() {
119
- VALUE mLevenshtein = rb_define_module("Levenshtein");
120
-
121
- rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
122
- }
@@ -1,30 +1,34 @@
1
1
  begin
2
- require "levenshtein/levenshtein_c"
2
+ require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
3
3
  rescue LoadError
4
4
  begin
5
- require "levenshtein_c"
5
+ require "levenshtein_fast" # If compiled by the build script.
6
6
  rescue LoadError
7
- $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
7
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
8
8
  end
9
9
  end
10
10
 
11
- # The Levenshtein distance is a metric for measuring the amount of difference
12
- # between two sequences (i.e., the so called edit distance). The Levenshtein
13
- # distance between two strings is given by the minimum number of operations
14
- # needed to transform one string into the other, where an operation is an
15
- # insertion, deletion, or substitution of a single character.
11
+ # The Levenshtein distance is a metric for measuring the amount
12
+ # of difference between two sequences (i.e., the so called edit
13
+ # distance). The Levenshtein distance between two sequences is
14
+ # given by the minimum number of operations needed to transform
15
+ # one sequence into the other, where an operation is an
16
+ # insertion, deletion, or substitution of a single element.
16
17
  #
17
18
  # More information about the Levenshtein distance algorithm:
18
19
  # http://en.wikipedia.org/wiki/Levenshtein_distance .
19
20
 
20
21
  module Levenshtein
21
- # Returns the Levenshtein distance as a number between 0.0 and 1.0.
22
- # It's basically the Levenshtein distance divided by the length of the longest string.
22
+ VERSION = "0.2.0"
23
+
24
+ # Returns the Levenshtein distance as a number between 0.0 and
25
+ # 1.0. It's basically the Levenshtein distance divided by the
26
+ # length of the longest sequence.
23
27
 
24
28
  def self.normalized_distance(s1, s2, threshold=nil)
25
29
  s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
26
30
 
27
- if s2.empty?
31
+ if s2.length == 0
28
32
  0.0 # Since s1.length < s2.length, s1 must be empty as well.
29
33
  else
30
34
  if threshold
@@ -39,46 +43,49 @@ module Levenshtein
39
43
  end
40
44
  end
41
45
 
42
- # Returns the Levenshtein distance between two byte strings.
46
+ # Returns the Levenshtein distance between two sequences.
47
+ #
48
+ # The two sequences can be two strings, two arrays, or two other
49
+ # objects. Strings, arrays and arrays of strings are handled with
50
+ # optimized (very fast) C code. All other sequences are handled
51
+ # with generic (fast) C code.
52
+ #
53
+ # The sequences should respond to :length and :[] and all objects
54
+ # in the sequences (as returned by []) should respond to :==.
43
55
 
44
56
  def self.distance(s1, s2, threshold=nil)
45
57
  s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
46
58
 
47
59
  # Handle some basic circumstances.
48
60
 
49
- return 0.0 if s1 == s2
50
- return s2.length if s1.empty?
51
- return nil if threshold and (s2.length-s1.length) >= threshold
52
- return nil if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
53
- return nil if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
54
-
55
- # Do the expensive calculation on a subset of the strings only, if possible.
61
+ return 0 if s1 == s2
62
+ return s2.length if s1.length == 0
56
63
 
57
- b = 0
58
- e1 = s1.length-1
59
- e2 = s2.length-1
64
+ if threshold
65
+ return nil if (s2.length-s1.length) >= threshold
60
66
 
61
- while s1[b, 1] == s2[b, 1]
62
- b += 1
63
- end
67
+ a1, a2 = nil, nil
68
+ a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
69
+ a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
64
70
 
65
- while s1[e1, 1] == s2[e2, 1] and e1 > b and e2 > b
66
- e1 -= 1
67
- e2 -= 1
71
+ if a1 and a2
72
+ return nil if (a1-a2).length >= threshold
73
+ return nil if (a2-a1).length >= threshold
74
+ end
68
75
  end
69
76
 
70
- distance_part2(s1[b..e1], s2[b..e2], threshold)
77
+ distance_fast_or_slow(s1, s2, threshold)
71
78
  end
72
79
 
73
- def self.distance_part2(s1, s2, threshold) # :nodoc:
74
- if respond_to?(:distance_part2_fast)
75
- distance_part2_fast(s1, s2, threshold) # Implemented in C.
80
+ def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
81
+ if respond_to?(:levenshtein_distance_fast)
82
+ levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
76
83
  else
77
- distance_part2_slow(s1, s2, threshold) # Implemented in Ruby.
84
+ levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
78
85
  end
79
86
  end
80
87
 
81
- def self.distance_part2_slow(s1, s2, threshold) # :nodoc:
88
+ def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
82
89
  row = (0..s1.length).to_a
83
90
 
84
91
  1.upto(s2.length) do |y|
@@ -89,8 +96,10 @@ module Levenshtein
89
96
  row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
90
97
  end
91
98
 
92
- # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
93
- # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)
99
+ # Stop analysing this sequence as soon as the best possible
100
+ # result for this sequence is bigger than the best result so far.
101
+ # (The minimum value in the next row will be equal to or greater
102
+ # than the minimum value in this row.)
94
103
 
95
104
  return nil if threshold and row.min >= threshold
96
105
  end
@@ -1,7 +1,35 @@
1
1
  require "test/unit"
2
2
  require "levenshtein"
3
3
 
4
- class TestLevenshtein < Test::Unit::TestCase
4
+ module Levenshtein
5
+ class TestSequence
6
+ def initialize(o)
7
+ @sequence = o
8
+ end
9
+
10
+ def length
11
+ @sequence.length
12
+ end
13
+
14
+ def [](pos)
15
+ @sequence[pos]
16
+ end
17
+ end
18
+
19
+ class TestElement
20
+ attr_reader :object
21
+
22
+ def initialize(o)
23
+ @object = o
24
+ end
25
+
26
+ def ==(other)
27
+ @object == other.object
28
+ end
29
+ end
30
+ end
31
+
32
+ class TestLevenshteinString < Test::Unit::TestCase
5
33
  def test_erik_veenstra
6
34
  assert_equal(7, Levenshtein.distance("erik", "veenstra"))
7
35
  assert_equal(7, Levenshtein.distance("veenstra", "erik"))
@@ -30,9 +58,11 @@ class TestLevenshtein < Test::Unit::TestCase
30
58
 
31
59
  def test_threshold
32
60
  assert_equal(3, Levenshtein.distance("foo", "foobar"))
61
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
33
62
  assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
34
63
 
35
64
  assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
65
+ assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
36
66
  assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
37
67
  end
38
68
 
@@ -45,47 +75,51 @@ class TestLevenshtein < Test::Unit::TestCase
45
75
  assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
46
76
  assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
47
77
  assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
78
+ assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
48
79
  end
49
80
  end
50
81
 
51
- class TestLevenshteinPart2Slow < Test::Unit::TestCase
82
+ class TestLevenshteinArray < Test::Unit::TestCase
52
83
  def test_erik_veenstra
53
- assert_equal(7, Levenshtein.distance_part2_slow("erik", "veenstra", nil))
54
- end
84
+ x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
55
85
 
56
- def test_empty_string
57
- assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
58
- assert_equal(3, Levenshtein.distance_part2_slow("", "foo", nil))
86
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
59
87
  end
88
+ end
60
89
 
61
- def test_same_string
62
- assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
63
- assert_equal(0, Levenshtein.distance_part2_slow("foo", "foo", nil))
90
+ class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
91
+ def test_erik_veenstra
92
+ x = lambda{|s| s.scan(/./)}
93
+
94
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
64
95
  end
96
+ end
65
97
 
66
- def test_threshold
67
- assert_equal(3, Levenshtein.distance_part2_slow("foo", "foobar", nil))
68
- assert_equal(nil, Levenshtein.distance_part2_slow("foo", "foobar", 2))
98
+ class TestLevenshteinGeneric < Test::Unit::TestCase
99
+ def test_erik_veenstra
100
+ x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
101
+
102
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
69
103
  end
70
104
  end
71
105
 
72
- class TestLevenshteinPart2Fast < Test::Unit::TestCase
106
+ class TestLevenshteinSlow < Test::Unit::TestCase
73
107
  def test_erik_veenstra
74
- assert_equal(7, Levenshtein.distance_part2_fast("erik", "veenstra", nil))
108
+ assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
75
109
  end
76
110
 
77
- def test_empty_string
78
- assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
79
- assert_equal(3, Levenshtein.distance_part2_fast("", "foo", nil))
111
+ def test_empty_sequence
112
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
113
+ assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
80
114
  end
81
115
 
82
- def test_same_string
83
- assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
84
- assert_equal(0, Levenshtein.distance_part2_fast("foo", "foo", nil))
116
+ def test_same_sequence
117
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
118
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
85
119
  end
86
120
 
87
121
  def test_threshold
88
- assert_equal(3, Levenshtein.distance_part2_fast("foo", "foobar", nil))
89
- assert_equal(nil, Levenshtein.distance_part2_fast("foo", "foobar", 2))
122
+ assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
123
+ assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
90
124
  end
91
125
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: levenshtein
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erik Veenstra
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-10-06 00:00:00 +02:00
12
+ date: 2009-07-11 00:00:00 +02:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -23,15 +23,20 @@ extra_rdoc_files: []
23
23
 
24
24
  files:
25
25
  - lib/levenshtein.rb
26
- - ext/levenshtein
27
26
  - ext/levenshtein/extconf.rb
28
- - ext/levenshtein/levenshtein_c.c
27
+ - ext/levenshtein/levenshtein_array_of_strings.c
28
+ - ext/levenshtein/levenshtein_fast.c
29
+ - ext/levenshtein/levenshtein_string.c
30
+ - ext/levenshtein/levenshtein_generic.c
31
+ - ext/levenshtein/levenshtein_array.c
29
32
  - README
30
33
  - LICENSE
31
34
  - VERSION
32
35
  - CHANGELOG
33
36
  has_rdoc: true
34
37
  homepage: http://www.erikveen.dds.nl/levenshtein/index.html
38
+ licenses: []
39
+
35
40
  post_install_message:
36
41
  rdoc_options:
37
42
  - README
@@ -39,7 +44,7 @@ rdoc_options:
39
44
  - VERSION
40
45
  - CHANGELOG
41
46
  - --title
42
- - levenshtein (0.1.1)
47
+ - levenshtein (0.2.0)
43
48
  - --main
44
49
  - README
45
50
  require_paths:
@@ -59,9 +64,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
59
64
  requirements: []
60
65
 
61
66
  rubyforge_project: levenshtein
62
- rubygems_version: 1.2.0
67
+ rubygems_version: 1.3.4
63
68
  signing_key:
64
- specification_version: 2
69
+ specification_version: 3
65
70
  summary: Calculates the Levenshtein distance between two byte strings.
66
71
  test_files:
67
72
  - test/test.rb