levenshtein 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 0.2.0 (11-07-2009)
2
+
3
+ * Return 0 instead of 0.0 in case of empty strings.
4
+
5
+ * Added specific support for arrays.
6
+
7
+ * Added specific support for arrays of strings.
8
+
9
+ * Added generic support for all (?) kinds of sequences.
10
+
11
+ * Moved a lot of code to the C world.
12
+
1
13
  0.1.1 (06-10-2008)
2
14
 
3
15
  * If one of the strings was both the begin and the end of the
data/README CHANGED
@@ -1,8 +1,12 @@
1
1
  The Levenshtein distance is a metric for measuring the amount of difference
2
2
  between two sequences (i.e., the so called edit distance). The Levenshtein
3
- distance between two strings is given by the minimum number of operations
4
- needed to transform one string into the other, where an operation is an
5
- insertion, deletion, or substitution of a single character.
3
+ distance between two sequences is given by the minimum number of operations
4
+ needed to transform one sequence into the other, where an operation is an
5
+ insertion, deletion, or substitution of a single element.
6
+
7
+ The two sequences can be two strings, two arrays, or two other objects.
8
+ Strings, arrays and arrays of strings are handled with optimized (very fast) C
9
+ code. All other sequences are handled with generic (fast) C code.
6
10
 
7
11
  More information about the Levenshtein distance algorithm:
8
12
  http://en.wikipedia.org/wiki/Levenshtein_distance .
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.0
@@ -2,4 +2,9 @@ require "mkmf"
2
2
 
3
3
  dir_config("levenshtein")
4
4
 
5
- create_makefile("levenshtein/levenshtein_c")
5
+ have_library("levenshtein_array")
6
+ have_library("levenshtein_array_of_strings")
7
+ have_library("levenshtein_generic")
8
+ have_library("levenshtein_string")
9
+
10
+ create_makefile("levenshtein/levenshtein_fast")
@@ -0,0 +1,127 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_eql = rb_intern("==");
12
+
13
+ /* Get the sizes of both arrays. */
14
+
15
+ l1 = RARRAY(rb_o1)->len;
16
+ l2 = RARRAY(rb_o2)->len;
17
+
18
+ /* Convert Ruby's threshold to C's threshold. */
19
+
20
+ if (!NIL_P(rb_threshold)) {
21
+ threshold = FIX2INT(rb_threshold);
22
+ } else {
23
+ threshold = -1;
24
+ }
25
+
26
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
+
28
+ offset = 0;
29
+ while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
30
+ offset++;
31
+ }
32
+
33
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
34
+
35
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
36
+ l1--;
37
+ l2--;
38
+ }
39
+
40
+ l1 -= offset;
41
+ l2 -= offset;
42
+
43
+ /* The Levenshtein algorithm itself. */
44
+
45
+ /* s1= */
46
+ /* ERIK */
47
+ /* */
48
+ /* 01234 */
49
+ /* s2=V 11234 */
50
+ /* E 21234 */
51
+ /* E 32234 */
52
+ /* N 43334 <- prev_row */
53
+ /* S 54444 <- curr_row */
54
+ /* T 65555 */
55
+ /* R 76566 */
56
+ /* A 87667 */
57
+
58
+ /* Allocate memory for both rows */
59
+
60
+ prev_row = ALLOC_N(int, l1+1);
61
+ curr_row = ALLOC_N(int, l1+1);
62
+
63
+ if ((prev_row == NULL) || (curr_row == NULL)) {
64
+ rb_raise(rb_eNoMemError, "out of memory");
65
+ }
66
+
67
+ /* Initialize the current row. */
68
+
69
+ for (col=0; col<=l1; col++) {
70
+ curr_row[col] = col;
71
+ }
72
+
73
+ for (row=1; row<=l2; row++) {
74
+ /* Copy the current row to the previous row. */
75
+
76
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
77
+
78
+ /* Calculate the values of the current row. */
79
+
80
+ curr_row[0] = row;
81
+ curr_row_min = row;
82
+
83
+ for (col=1; col<=l1; col++) {
84
+ /* Equal (cost=0) or substitution (cost=1). */
85
+
86
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
87
+
88
+ /* Insertion if it's cheaper than substitution. */
89
+
90
+ if (prev_row[col]+1 < curr_row[col]) {
91
+ curr_row[col] = prev_row[col]+1;
92
+ }
93
+
94
+ /* Deletion if it's cheaper than substitution. */
95
+
96
+ if (curr_row[col-1]+1 < curr_row[col]) {
97
+ curr_row[col] = curr_row[col-1]+1;
98
+ }
99
+
100
+ /* Keep track of the minimum value on this row. */
101
+
102
+ if (curr_row[col] < curr_row_min) {
103
+ curr_row_min = curr_row[col];
104
+ }
105
+ }
106
+
107
+ /* Return nil as soon as we exceed the threshold. */
108
+
109
+ if (threshold > -1 && curr_row_min >= threshold) {
110
+ free(prev_row);
111
+ free(curr_row);
112
+
113
+ return Qnil;
114
+ }
115
+ }
116
+
117
+ /* The result is the last value on the last row. */
118
+
119
+ result = curr_row[l1];
120
+
121
+ free(prev_row);
122
+ free(curr_row);
123
+
124
+ /* Return the Ruby version of the result. */
125
+
126
+ return INT2FIX(result);
127
+ }
@@ -0,0 +1,125 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ /* Get the sizes of both arrays. */
12
+
13
+ l1 = RARRAY(rb_o1)->len;
14
+ l2 = RARRAY(rb_o2)->len;
15
+
16
+ /* Convert Ruby's threshold to C's threshold. */
17
+
18
+ if (!NIL_P(rb_threshold)) {
19
+ threshold = FIX2INT(rb_threshold);
20
+ } else {
21
+ threshold = -1;
22
+ }
23
+
24
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
25
+
26
+ offset = 0;
27
+ while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
28
+ offset++;
29
+ }
30
+
31
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
32
+
33
+ while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
34
+ l1--;
35
+ l2--;
36
+ }
37
+
38
+ l1 -= offset;
39
+ l2 -= offset;
40
+
41
+ /* The Levenshtein algorithm itself. */
42
+
43
+ /* s1= */
44
+ /* ERIK */
45
+ /* */
46
+ /* 01234 */
47
+ /* s2=V 11234 */
48
+ /* E 21234 */
49
+ /* E 32234 */
50
+ /* N 43334 <- prev_row */
51
+ /* S 54444 <- curr_row */
52
+ /* T 65555 */
53
+ /* R 76566 */
54
+ /* A 87667 */
55
+
56
+ /* Allocate memory for both rows */
57
+
58
+ prev_row = ALLOC_N(int, l1+1);
59
+ curr_row = ALLOC_N(int, l1+1);
60
+
61
+ if ((prev_row == NULL) || (curr_row == NULL)) {
62
+ rb_raise(rb_eNoMemError, "out of memory");
63
+ }
64
+
65
+ /* Initialize the current row. */
66
+
67
+ for (col=0; col<=l1; col++) {
68
+ curr_row[col] = col;
69
+ }
70
+
71
+ for (row=1; row<=l2; row++) {
72
+ /* Copy the current row to the previous row. */
73
+
74
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
75
+
76
+ /* Calculate the values of the current row. */
77
+
78
+ curr_row[0] = row;
79
+ curr_row_min = row;
80
+
81
+ for (col=1; col<=l1; col++) {
82
+ /* Equal (cost=0) or substitution (cost=1). */
83
+
84
+ curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
+
86
+ /* Insertion if it's cheaper than substitution. */
87
+
88
+ if (prev_row[col]+1 < curr_row[col]) {
89
+ curr_row[col] = prev_row[col]+1;
90
+ }
91
+
92
+ /* Deletion if it's cheaper than substitution. */
93
+
94
+ if (curr_row[col-1]+1 < curr_row[col]) {
95
+ curr_row[col] = curr_row[col-1]+1;
96
+ }
97
+
98
+ /* Keep track of the minimum value on this row. */
99
+
100
+ if (curr_row[col] < curr_row_min) {
101
+ curr_row_min = curr_row[col];
102
+ }
103
+ }
104
+
105
+ /* Return nil as soon as we exceed the threshold. */
106
+
107
+ if (threshold > -1 && curr_row_min >= threshold) {
108
+ free(prev_row);
109
+ free(curr_row);
110
+
111
+ return Qnil;
112
+ }
113
+ }
114
+
115
+ /* The result is the last value on the last row. */
116
+
117
+ result = curr_row[l1];
118
+
119
+ free(prev_row);
120
+ free(curr_row);
121
+
122
+ /* Return the Ruby version of the result. */
123
+
124
+ return INT2FIX(result);
125
+ }
@@ -0,0 +1,21 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
5
+ return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
6
+ } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
7
+ if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
8
+ return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
9
+ } else {
10
+ return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
11
+ }
12
+ } else {
13
+ return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
14
+ }
15
+ }
16
+
17
+ void Init_levenshtein_fast() {
18
+ VALUE mLevenshtein = rb_define_module("Levenshtein");
19
+
20
+ rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
21
+ }
@@ -0,0 +1,129 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_length = rb_intern("length");
12
+ ID id_get = rb_intern("[]");
13
+ ID id_equal = rb_intern("==");
14
+
15
+ /* Get the sizes of both sequences. */
16
+
17
+ l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
18
+ l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
19
+
20
+ /* Convert Ruby's threshold to C's threshold. */
21
+
22
+ if (!NIL_P(rb_threshold)) {
23
+ threshold = FIX2INT(rb_threshold);
24
+ } else {
25
+ threshold = -1;
26
+ }
27
+
28
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
29
+
30
+ offset = 0;
31
+ while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
32
+ offset++;
33
+ }
34
+
35
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
36
+
37
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
38
+ l1--;
39
+ l2--;
40
+ }
41
+
42
+ l1 -= offset;
43
+ l2 -= offset;
44
+
45
+ /* The Levenshtein algorithm itself. */
46
+
47
+ /* s1= */
48
+ /* ERIK */
49
+ /* */
50
+ /* 01234 */
51
+ /* s2=V 11234 */
52
+ /* E 21234 */
53
+ /* E 32234 */
54
+ /* N 43334 <- prev_row */
55
+ /* S 54444 <- curr_row */
56
+ /* T 65555 */
57
+ /* R 76566 */
58
+ /* A 87667 */
59
+
60
+ /* Allocate memory for both rows */
61
+
62
+ prev_row = ALLOC_N(int, l1+1);
63
+ curr_row = ALLOC_N(int, l1+1);
64
+
65
+ if ((prev_row == NULL) || (curr_row == NULL)) {
66
+ rb_raise(rb_eNoMemError, "out of memory");
67
+ }
68
+
69
+ /* Initialize the current row. */
70
+
71
+ for (col=0; col<=l1; col++) {
72
+ curr_row[col] = col;
73
+ }
74
+
75
+ for (row=1; row<=l2; row++) {
76
+ /* Copy the current row to the previous row. */
77
+
78
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
79
+
80
+ /* Calculate the values of the current row. */
81
+
82
+ curr_row[0] = row;
83
+ curr_row_min = row;
84
+
85
+ for (col=1; col<=l1; col++) {
86
+ /* Equal (cost=0) or substitution (cost=1). */
87
+
88
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
+
90
+ /* Insertion if it's cheaper than substitution. */
91
+
92
+ if (prev_row[col]+1 < curr_row[col]) {
93
+ curr_row[col] = prev_row[col]+1;
94
+ }
95
+
96
+ /* Deletion if it's cheaper than substitution. */
97
+
98
+ if (curr_row[col-1]+1 < curr_row[col]) {
99
+ curr_row[col] = curr_row[col-1]+1;
100
+ }
101
+
102
+ /* Keep track of the minimum value on this row. */
103
+
104
+ if (curr_row[col] < curr_row_min) {
105
+ curr_row_min = curr_row[col];
106
+ }
107
+ }
108
+
109
+ /* Return nil as soon as we exceed the threshold. */
110
+
111
+ if (threshold > -1 && curr_row_min >= threshold) {
112
+ free(prev_row);
113
+ free(curr_row);
114
+
115
+ return Qnil;
116
+ }
117
+ }
118
+
119
+ /* The result is the last value on the last row. */
120
+
121
+ result = curr_row[l1];
122
+
123
+ free(prev_row);
124
+ free(curr_row);
125
+
126
+ /* Return the Ruby version of the result. */
127
+
128
+ return INT2FIX(result);
129
+ }
@@ -1,25 +1,25 @@
1
1
  #include "ruby.h"
2
2
 
3
- static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
4
- VALUE rb_s3;
3
+ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
4
  int threshold;
6
- int l1, l2, l3;
7
- char *s1, *s2, *s3;
5
+ int l1, l2;
8
6
  int *prev_row, *curr_row;
9
7
  int col, row;
10
8
  int curr_row_min, result;
9
+ int offset;
10
+ char *s1, *s2;
11
11
 
12
12
  /* Convert Ruby's s1 to C's s1. */
13
13
 
14
- rb_s1 = StringValue(rb_s1);
15
- s1 = RSTRING(rb_s1)->ptr;
16
- l1 = RSTRING(rb_s1)->len;
14
+ rb_o1 = StringValue(rb_o1);
15
+ s1 = RSTRING(rb_o1)->ptr;
16
+ l1 = RSTRING(rb_o1)->len;
17
17
 
18
18
  /* Convert Ruby's s2 to C's s2. */
19
19
 
20
- rb_s2 = StringValue(rb_s2);
21
- s2 = RSTRING(rb_s2)->ptr;
22
- l2 = RSTRING(rb_s2)->len;
20
+ rb_o2 = StringValue(rb_o2);
21
+ s2 = RSTRING(rb_o2)->ptr;
22
+ l2 = RSTRING(rb_o2)->len;
23
23
 
24
24
  /* Convert Ruby's threshold to C's threshold. */
25
25
 
@@ -29,7 +29,24 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
29
29
  threshold = -1;
30
30
  }
31
31
 
32
- /* The Levenshtein Algorithm itself. */
32
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
33
+
34
+ offset = 0;
35
+ while (s1[offset] == s2[offset]) {
36
+ offset++;
37
+ }
38
+
39
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
40
+
41
+ while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
42
+ l1--;
43
+ l2--;
44
+ }
45
+
46
+ l1 -= offset;
47
+ l2 -= offset;
48
+
49
+ /* The Levenshtein algorithm itself. */
33
50
 
34
51
  /* s1= */
35
52
  /* ERIK */
@@ -43,7 +60,7 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
43
60
  /* T 65555 */
44
61
  /* R 76566 */
45
62
  /* A 87667 */
46
-
63
+
47
64
  /* Allocate memory for both rows */
48
65
 
49
66
  prev_row = ALLOC_N(int, l1+1);
@@ -70,9 +87,9 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
70
87
  curr_row_min = row;
71
88
 
72
89
  for (col=1; col<=l1; col++) {
73
- /* Equal (cost=0) or Substitution (cost=1). */
90
+ /* Equal (cost=0) or substitution (cost=1). */
74
91
 
75
- curr_row[col] = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);
92
+ curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
76
93
 
77
94
  /* Insertion if it's cheaper than substitution. */
78
95
 
@@ -114,9 +131,3 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
114
131
 
115
132
  return INT2FIX(result);
116
133
  }
117
-
118
- void Init_levenshtein_c() {
119
- VALUE mLevenshtein = rb_define_module("Levenshtein");
120
-
121
- rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
122
- }
@@ -1,30 +1,34 @@
1
1
  begin
2
- require "levenshtein/levenshtein_c"
2
+ require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
3
3
  rescue LoadError
4
4
  begin
5
- require "levenshtein_c"
5
+ require "levenshtein_fast" # If compiled by the build script.
6
6
  rescue LoadError
7
- $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
7
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
8
8
  end
9
9
  end
10
10
 
11
- # The Levenshtein distance is a metric for measuring the amount of difference
12
- # between two sequences (i.e., the so called edit distance). The Levenshtein
13
- # distance between two strings is given by the minimum number of operations
14
- # needed to transform one string into the other, where an operation is an
15
- # insertion, deletion, or substitution of a single character.
11
+ # The Levenshtein distance is a metric for measuring the amount
12
+ # of difference between two sequences (i.e., the so called edit
13
+ # distance). The Levenshtein distance between two sequences is
14
+ # given by the minimum number of operations needed to transform
15
+ # one sequence into the other, where an operation is an
16
+ # insertion, deletion, or substitution of a single element.
16
17
  #
17
18
  # More information about the Levenshtein distance algorithm:
18
19
  # http://en.wikipedia.org/wiki/Levenshtein_distance .
19
20
 
20
21
  module Levenshtein
21
- # Returns the Levenshtein distance as a number between 0.0 and 1.0.
22
- # It's basically the Levenshtein distance divided by the length of the longest string.
22
+ VERSION = "0.2.0"
23
+
24
+ # Returns the Levenshtein distance as a number between 0.0 and
25
+ # 1.0. It's basically the Levenshtein distance divided by the
26
+ # length of the longest sequence.
23
27
 
24
28
  def self.normalized_distance(s1, s2, threshold=nil)
25
29
  s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
26
30
 
27
- if s2.empty?
31
+ if s2.length == 0
28
32
  0.0 # Since s1.length < s2.length, s1 must be empty as well.
29
33
  else
30
34
  if threshold
@@ -39,46 +43,49 @@ module Levenshtein
39
43
  end
40
44
  end
41
45
 
42
- # Returns the Levenshtein distance between two byte strings.
46
+ # Returns the Levenshtein distance between two sequences.
47
+ #
48
+ # The two sequences can be two strings, two arrays, or two other
49
+ # objects. Strings, arrays and arrays of strings are handled with
50
+ # optimized (very fast) C code. All other sequences are handled
51
+ # with generic (fast) C code.
52
+ #
53
+ # The sequences should respond to :length and :[] and all objects
54
+ # in the sequences (as returned by []) should respond to :==.
43
55
 
44
56
  def self.distance(s1, s2, threshold=nil)
45
57
  s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
46
58
 
47
59
  # Handle some basic circumstances.
48
60
 
49
- return 0.0 if s1 == s2
50
- return s2.length if s1.empty?
51
- return nil if threshold and (s2.length-s1.length) >= threshold
52
- return nil if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
53
- return nil if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
54
-
55
- # Do the expensive calculation on a subset of the strings only, if possible.
61
+ return 0 if s1 == s2
62
+ return s2.length if s1.length == 0
56
63
 
57
- b = 0
58
- e1 = s1.length-1
59
- e2 = s2.length-1
64
+ if threshold
65
+ return nil if (s2.length-s1.length) >= threshold
60
66
 
61
- while s1[b, 1] == s2[b, 1]
62
- b += 1
63
- end
67
+ a1, a2 = nil, nil
68
+ a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
69
+ a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
64
70
 
65
- while s1[e1, 1] == s2[e2, 1] and e1 > b and e2 > b
66
- e1 -= 1
67
- e2 -= 1
71
+ if a1 and a2
72
+ return nil if (a1-a2).length >= threshold
73
+ return nil if (a2-a1).length >= threshold
74
+ end
68
75
  end
69
76
 
70
- distance_part2(s1[b..e1], s2[b..e2], threshold)
77
+ distance_fast_or_slow(s1, s2, threshold)
71
78
  end
72
79
 
73
- def self.distance_part2(s1, s2, threshold) # :nodoc:
74
- if respond_to?(:distance_part2_fast)
75
- distance_part2_fast(s1, s2, threshold) # Implemented in C.
80
+ def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
81
+ if respond_to?(:levenshtein_distance_fast)
82
+ levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
76
83
  else
77
- distance_part2_slow(s1, s2, threshold) # Implemented in Ruby.
84
+ levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
78
85
  end
79
86
  end
80
87
 
81
- def self.distance_part2_slow(s1, s2, threshold) # :nodoc:
88
+ def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
82
89
  row = (0..s1.length).to_a
83
90
 
84
91
  1.upto(s2.length) do |y|
@@ -89,8 +96,10 @@ module Levenshtein
89
96
  row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
90
97
  end
91
98
 
92
- # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
93
- # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)
99
+ # Stop analysing this sequence as soon as the best possible
100
+ # result for this sequence is bigger than the best result so far.
101
+ # (The minimum value in the next row will be equal to or greater
102
+ # than the minimum value in this row.)
94
103
 
95
104
  return nil if threshold and row.min >= threshold
96
105
  end
@@ -1,7 +1,35 @@
1
1
  require "test/unit"
2
2
  require "levenshtein"
3
3
 
4
- class TestLevenshtein < Test::Unit::TestCase
4
module Levenshtein
  # Minimal sequence wrapper used by the tests: it exposes only #length
  # and #[], both delegated to the wrapped object.
  class TestSequence
    # o - the underlying sequence; it must respond to #length and #[].
    def initialize(o)
      @sequence = o
    end

    # Number of elements in the wrapped sequence.
    def length
      @sequence.length
    end

    # Element at position +pos+, exactly as the wrapped sequence returns it.
    def [](pos)
      @sequence[pos]
    end
  end

  # Minimal element wrapper used by the tests: two elements are equal
  # when the objects they wrap are equal.
  class TestElement
    attr_reader :object

    # o - the wrapped object.
    def initialize(o)
      @object = o
    end

    # Compares the wrapped objects. Anything that does not expose #object
    # (for instance the nil returned by an out-of-range index) is simply
    # not equal, instead of raising NoMethodError.
    def ==(other)
      other.respond_to?(:object) && @object == other.object
    end
  end
end
31
+
32
+ class TestLevenshteinString < Test::Unit::TestCase
5
33
  def test_erik_veenstra
6
34
  assert_equal(7, Levenshtein.distance("erik", "veenstra"))
7
35
  assert_equal(7, Levenshtein.distance("veenstra", "erik"))
@@ -30,9 +58,11 @@ class TestLevenshtein < Test::Unit::TestCase
30
58
 
31
59
  def test_threshold
32
60
  assert_equal(3, Levenshtein.distance("foo", "foobar"))
61
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
33
62
  assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
34
63
 
35
64
  assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
65
+ assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
36
66
  assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
37
67
  end
38
68
 
@@ -45,47 +75,51 @@ class TestLevenshtein < Test::Unit::TestCase
45
75
  assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
46
76
  assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
47
77
  assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
78
+ assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
48
79
  end
49
80
  end
50
81
 
51
- class TestLevenshteinPart2Slow < Test::Unit::TestCase
82
+ class TestLevenshteinArray < Test::Unit::TestCase
52
83
  def test_erik_veenstra
53
- assert_equal(7, Levenshtein.distance_part2_slow("erik", "veenstra", nil))
54
- end
84
+ x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
55
85
 
56
- def test_empty_string
57
- assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
58
- assert_equal(3, Levenshtein.distance_part2_slow("", "foo", nil))
86
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
59
87
  end
88
+ end
60
89
 
61
- def test_same_string
62
- assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
63
- assert_equal(0, Levenshtein.distance_part2_slow("foo", "foo", nil))
90
+ class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
91
+ def test_erik_veenstra
92
+ x = lambda{|s| s.scan(/./)}
93
+
94
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
64
95
  end
96
+ end
65
97
 
66
- def test_threshold
67
- assert_equal(3, Levenshtein.distance_part2_slow("foo", "foobar", nil))
68
- assert_equal(nil, Levenshtein.distance_part2_slow("foo", "foobar", 2))
98
+ class TestLevenshteinGeneric < Test::Unit::TestCase
99
+ def test_erik_veenstra
100
+ x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
101
+
102
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
69
103
  end
70
104
  end
71
105
 
72
- class TestLevenshteinPart2Fast < Test::Unit::TestCase
106
+ class TestLevenshteinSlow < Test::Unit::TestCase
73
107
  def test_erik_veenstra
74
- assert_equal(7, Levenshtein.distance_part2_fast("erik", "veenstra", nil))
108
+ assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
75
109
  end
76
110
 
77
- def test_empty_string
78
- assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
79
- assert_equal(3, Levenshtein.distance_part2_fast("", "foo", nil))
111
+ def test_empty_sequence
112
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
113
+ assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
80
114
  end
81
115
 
82
- def test_same_string
83
- assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
84
- assert_equal(0, Levenshtein.distance_part2_fast("foo", "foo", nil))
116
+ def test_same_sequence
117
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
118
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
85
119
  end
86
120
 
87
121
  def test_threshold
88
- assert_equal(3, Levenshtein.distance_part2_fast("foo", "foobar", nil))
89
- assert_equal(nil, Levenshtein.distance_part2_fast("foo", "foobar", 2))
122
+ assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
123
+ assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
90
124
  end
91
125
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: levenshtein
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erik Veenstra
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-10-06 00:00:00 +02:00
12
+ date: 2009-07-11 00:00:00 +02:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -23,15 +23,20 @@ extra_rdoc_files: []
23
23
 
24
24
  files:
25
25
  - lib/levenshtein.rb
26
- - ext/levenshtein
27
26
  - ext/levenshtein/extconf.rb
28
- - ext/levenshtein/levenshtein_c.c
27
+ - ext/levenshtein/levenshtein_array_of_strings.c
28
+ - ext/levenshtein/levenshtein_fast.c
29
+ - ext/levenshtein/levenshtein_string.c
30
+ - ext/levenshtein/levenshtein_generic.c
31
+ - ext/levenshtein/levenshtein_array.c
29
32
  - README
30
33
  - LICENSE
31
34
  - VERSION
32
35
  - CHANGELOG
33
36
  has_rdoc: true
34
37
  homepage: http://www.erikveen.dds.nl/levenshtein/index.html
38
+ licenses: []
39
+
35
40
  post_install_message:
36
41
  rdoc_options:
37
42
  - README
@@ -39,7 +44,7 @@ rdoc_options:
39
44
  - VERSION
40
45
  - CHANGELOG
41
46
  - --title
42
- - levenshtein (0.1.1)
47
+ - levenshtein (0.2.0)
43
48
  - --main
44
49
  - README
45
50
  require_paths:
@@ -59,9 +64,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
59
64
  requirements: []
60
65
 
61
66
  rubyforge_project: levenshtein
62
- rubygems_version: 1.2.0
67
+ rubygems_version: 1.3.4
63
68
  signing_key:
64
- specification_version: 2
69
+ specification_version: 3
65
70
  summary: Calculates the Levenshtein distance between two byte strings.
66
71
  test_files:
67
72
  - test/test.rb