levenshtein 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ 0.2.1 (11-02-2012)
2
+
3
+ * Better memory handling.
4
+
5
+ * Little speed improvements.
6
+
7
+ * Ruby 1.9 compatible?
8
+
1
9
  0.2.0 (11-07-2009)
2
10
 
3
11
  * Return 0 instead of 0.0 in case of empty strings.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
@@ -0,0 +1,13 @@
1
+ #ifdef RARRAY_PTR
2
+ #else
3
+ #define RARRAY_PTR(o) (RARRAY(o)->ptr)
4
+ #define RARRAY_LEN(o) (RARRAY(o)->len)
5
+ #endif
6
+
7
+ #ifdef RSTRING_PTR
8
+ #else
9
+ #define RSTRING_PTR(o) (RSTRING(o)->ptr)
10
+ #define RSTRING_LEN(o) (RSTRING(o)->len)
11
+ #endif
12
+
13
+ VALUE mLevenshtein;
@@ -1,19 +1,19 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
10
-
11
- ID id_eql = rb_intern("==");
11
+ int value1, value2;
12
12
 
13
13
  /* Get the sizes of both arrays. */
14
14
 
15
- l1 = RARRAY(rb_o1)->len;
16
- l2 = RARRAY(rb_o2)->len;
15
+ l1 = RARRAY_LEN(rb_o1);
16
+ l2 = RARRAY_LEN(rb_o2);
17
17
 
18
18
  /* Convert Ruby's threshold to C's threshold. */
19
19
 
@@ -26,13 +26,14 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
26
26
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
27
 
28
28
  offset = 0;
29
- while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
29
+
30
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
30
31
  offset++;
31
32
  }
32
33
 
33
34
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
34
35
 
35
- while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
36
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
36
37
  l1--;
37
38
  l2--;
38
39
  }
@@ -57,12 +58,8 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
57
58
 
58
59
  /* Allocate memory for both rows */
59
60
 
60
- prev_row = ALLOC_N(int, l1+1);
61
- curr_row = ALLOC_N(int, l1+1);
62
-
63
- if ((prev_row == NULL) || (curr_row == NULL)) {
64
- rb_raise(rb_eNoMemError, "out of memory");
65
- }
61
+ prev_row = (int*) ALLOC_N(int, (l1+1));
62
+ curr_row = (int*) ALLOC_N(int, (l1+1));
66
63
 
67
64
  /* Initialize the current row. */
68
65
 
@@ -73,7 +70,9 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
73
70
  for (row=1; row<=l2; row++) {
74
71
  /* Copy the current row to the previous row. */
75
72
 
76
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
73
+ temp_row = prev_row;
74
+ prev_row = curr_row;
75
+ curr_row = temp_row;
77
76
 
78
77
  /* Calculate the values of the current row. */
79
78
 
@@ -83,25 +82,29 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
83
82
  for (col=1; col<=l1; col++) {
84
83
  /* Equal (cost=0) or substitution (cost=1). */
85
84
 
86
- curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
85
+ value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
87
86
 
88
87
  /* Insertion if it's cheaper than substitution. */
89
88
 
90
- if (prev_row[col]+1 < curr_row[col]) {
91
- curr_row[col] = prev_row[col]+1;
89
+ value2 = prev_row[col]+1;
90
+ if (value2 < value1) {
91
+ value1 = value2;
92
92
  }
93
93
 
94
94
  /* Deletion if it's cheaper than substitution. */
95
95
 
96
- if (curr_row[col-1]+1 < curr_row[col]) {
97
- curr_row[col] = curr_row[col-1]+1;
96
+ value2 = curr_row[col-1]+1;
97
+ if (value2 < value1) {
98
+ value1 = value2;
98
99
  }
99
100
 
100
101
  /* Keep track of the minimum value on this row. */
101
102
 
102
- if (curr_row[col] < curr_row_min) {
103
- curr_row_min = curr_row[col];
103
+ if (value1 < curr_row_min) {
104
+ curr_row_min = value1;
104
105
  }
106
+
107
+ curr_row[col] = value1;
105
108
  }
106
109
 
107
110
  /* Return nil as soon as we exceed the threshold. */
@@ -1,17 +1,19 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
11
+ int value1, value2;
10
12
 
11
13
  /* Get the sizes of both arrays. */
12
14
 
13
- l1 = RARRAY(rb_o1)->len;
14
- l2 = RARRAY(rb_o2)->len;
15
+ l1 = RARRAY_LEN(rb_o1);
16
+ l2 = RARRAY_LEN(rb_o2);
15
17
 
16
18
  /* Convert Ruby's threshold to C's threshold. */
17
19
 
@@ -24,13 +26,14 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
24
26
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
25
27
 
26
28
  offset = 0;
27
- while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
29
+
30
+ while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
28
31
  offset++;
29
32
  }
30
33
 
31
34
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
32
35
 
33
- while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
36
+ while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
34
37
  l1--;
35
38
  l2--;
36
39
  }
@@ -55,12 +58,8 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
55
58
 
56
59
  /* Allocate memory for both rows */
57
60
 
58
- prev_row = ALLOC_N(int, l1+1);
59
- curr_row = ALLOC_N(int, l1+1);
60
-
61
- if ((prev_row == NULL) || (curr_row == NULL)) {
62
- rb_raise(rb_eNoMemError, "out of memory");
63
- }
61
+ prev_row = (int*) ALLOC_N(int, (l1+1));
62
+ curr_row = (int*) ALLOC_N(int, (l1+1));
64
63
 
65
64
  /* Initialize the current row. */
66
65
 
@@ -71,7 +70,9 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
71
70
  for (row=1; row<=l2; row++) {
72
71
  /* Copy the current row to the previous row. */
73
72
 
74
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
73
+ temp_row = prev_row;
74
+ prev_row = curr_row;
75
+ curr_row = temp_row;
75
76
 
76
77
  /* Calculate the values of the current row. */
77
78
 
@@ -81,25 +82,29 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
81
82
  for (col=1; col<=l1; col++) {
82
83
  /* Equal (cost=0) or substitution (cost=1). */
83
84
 
84
- curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
+ value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
86
 
86
87
  /* Insertion if it's cheaper than substitution. */
87
88
 
88
- if (prev_row[col]+1 < curr_row[col]) {
89
- curr_row[col] = prev_row[col]+1;
89
+ value2 = prev_row[col]+1;
90
+ if (value2 < value1) {
91
+ value1 = value2;
90
92
  }
91
93
 
92
94
  /* Deletion if it's cheaper than substitution. */
93
95
 
94
- if (curr_row[col-1]+1 < curr_row[col]) {
95
- curr_row[col] = curr_row[col-1]+1;
96
+ value2 = curr_row[col-1]+1;
97
+ if (value2 < value1) {
98
+ value1 = value2;
96
99
  }
97
100
 
98
101
  /* Keep track of the minimum value on this row. */
99
102
 
100
- if (curr_row[col] < curr_row_min) {
101
- curr_row_min = curr_row[col];
103
+ if (value1 < curr_row_min) {
104
+ curr_row_min = value1;
102
105
  }
106
+
107
+ curr_row[col] = value1;
103
108
  }
104
109
 
105
110
  /* Return nil as soon as we exceed the threshold. */
@@ -1,4 +1,5 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
@@ -15,7 +16,7 @@ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_t
15
16
  }
16
17
 
17
18
  void Init_levenshtein_fast() {
18
- VALUE mLevenshtein = rb_define_module("Levenshtein");
19
+ mLevenshtein = rb_const_get(rb_mKernel, rb_intern("Levenshtein"));
19
20
 
20
- rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
21
+ rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3);
21
22
  }
@@ -1,16 +1,17 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
11
+ int value1, value2;
10
12
 
11
- ID id_length = rb_intern("length");
12
- ID id_get = rb_intern("[]");
13
- ID id_equal = rb_intern("==");
13
+ ID id_length = rb_intern("length");
14
+ ID id_get = rb_intern("[]");
14
15
 
15
16
  /* Get the sizes of both sequences. */
16
17
 
@@ -28,13 +29,14 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
28
29
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
29
30
 
30
31
  offset = 0;
31
- while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
32
+
33
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
32
34
  offset++;
33
35
  }
34
36
 
35
37
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
36
38
 
37
- while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
39
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
38
40
  l1--;
39
41
  l2--;
40
42
  }
@@ -59,12 +61,8 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
59
61
 
60
62
  /* Allocate memory for both rows */
61
63
 
62
- prev_row = ALLOC_N(int, l1+1);
63
- curr_row = ALLOC_N(int, l1+1);
64
-
65
- if ((prev_row == NULL) || (curr_row == NULL)) {
66
- rb_raise(rb_eNoMemError, "out of memory");
67
- }
64
+ prev_row = (int*) ALLOC_N(int, (l1+1));
65
+ curr_row = (int*) ALLOC_N(int, (l1+1));
68
66
 
69
67
  /* Initialize the current row. */
70
68
 
@@ -75,7 +73,9 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
75
73
  for (row=1; row<=l2; row++) {
76
74
  /* Copy the current row to the previous row. */
77
75
 
78
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
76
+ temp_row = prev_row;
77
+ prev_row = curr_row;
78
+ curr_row = temp_row;
79
79
 
80
80
  /* Calculate the values of the current row. */
81
81
 
@@ -85,25 +85,29 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
85
85
  for (col=1; col<=l1; col++) {
86
86
  /* Equal (cost=0) or substitution (cost=1). */
87
87
 
88
- curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
88
+ value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
89
 
90
90
  /* Insertion if it's cheaper than substitution. */
91
91
 
92
- if (prev_row[col]+1 < curr_row[col]) {
93
- curr_row[col] = prev_row[col]+1;
92
+ value2 = prev_row[col]+1;
93
+ if (value2 < value1) {
94
+ value1 = value2;
94
95
  }
95
96
 
96
97
  /* Deletion if it's cheaper than substitution. */
97
98
 
98
- if (curr_row[col-1]+1 < curr_row[col]) {
99
- curr_row[col] = curr_row[col-1]+1;
99
+ value2 = curr_row[col-1]+1;
100
+ if (value2 < value1) {
101
+ value1 = value2;
100
102
  }
101
103
 
102
104
  /* Keep track of the minimum value on this row. */
103
105
 
104
- if (curr_row[col] < curr_row_min) {
105
- curr_row_min = curr_row[col];
106
+ if (value1 < curr_row_min) {
107
+ curr_row_min = value1;
106
108
  }
109
+
110
+ curr_row[col] = value1;
107
111
  }
108
112
 
109
113
  /* Return nil as soon as we exceed the threshold. */
@@ -1,25 +1,27 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
11
+ int value1, value2;
10
12
  char *s1, *s2;
11
13
 
12
14
  /* Convert Ruby's s1 to C's s1. */
13
15
 
14
16
  rb_o1 = StringValue(rb_o1);
15
- s1 = RSTRING(rb_o1)->ptr;
16
- l1 = RSTRING(rb_o1)->len;
17
+ s1 = RSTRING_PTR(rb_o1);
18
+ l1 = RSTRING_LEN(rb_o1);
17
19
 
18
20
  /* Convert Ruby's s2 to C's s2. */
19
21
 
20
22
  rb_o2 = StringValue(rb_o2);
21
- s2 = RSTRING(rb_o2)->ptr;
22
- l2 = RSTRING(rb_o2)->len;
23
+ s2 = RSTRING_PTR(rb_o2);
24
+ l2 = RSTRING_LEN(rb_o2);
23
25
 
24
26
  /* Convert Ruby's threshold to C's threshold. */
25
27
 
@@ -32,13 +34,14 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
32
34
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
33
35
 
34
36
  offset = 0;
35
- while (s1[offset] == s2[offset]) {
37
+
38
+ while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
36
39
  offset++;
37
40
  }
38
41
 
39
42
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
40
43
 
41
- while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
44
+ while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
42
45
  l1--;
43
46
  l2--;
44
47
  }
@@ -63,12 +66,8 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
63
66
 
64
67
  /* Allocate memory for both rows */
65
68
 
66
- prev_row = ALLOC_N(int, l1+1);
67
- curr_row = ALLOC_N(int, l1+1);
68
-
69
- if ((prev_row == NULL) || (curr_row == NULL)) {
70
- rb_raise(rb_eNoMemError, "out of memory");
71
- }
69
+ prev_row = (int*) ALLOC_N(int, (l1+1));
70
+ curr_row = (int*) ALLOC_N(int, (l1+1));
72
71
 
73
72
  /* Initialize the current row. */
74
73
 
@@ -79,7 +78,9 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
79
78
  for (row=1; row<=l2; row++) {
80
79
  /* Copy the current row to the previous row. */
81
80
 
82
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
81
+ temp_row = prev_row;
82
+ prev_row = curr_row;
83
+ curr_row = temp_row;
83
84
 
84
85
  /* Calculate the values of the current row. */
85
86
 
@@ -89,25 +90,29 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
89
90
  for (col=1; col<=l1; col++) {
90
91
  /* Equal (cost=0) or substitution (cost=1). */
91
92
 
92
- curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
93
+ value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
93
94
 
94
95
  /* Insertion if it's cheaper than substitution. */
95
96
 
96
- if (prev_row[col]+1 < curr_row[col]) {
97
- curr_row[col] = prev_row[col]+1;
97
+ value2 = prev_row[col]+1;
98
+ if (value2 < value1) {
99
+ value1 = value2;
98
100
  }
99
101
 
100
102
  /* Deletion if it's cheaper than substitution. */
101
103
 
102
- if (curr_row[col-1]+1 < curr_row[col]) {
103
- curr_row[col] = curr_row[col-1]+1;
104
+ value2 = curr_row[col-1]+1;
105
+ if (value2 < value1) {
106
+ value1 = value2;
104
107
  }
105
108
 
106
109
  /* Keep track of the minimum value on this row. */
107
110
 
108
- if (curr_row[col] < curr_row_min) {
109
- curr_row_min = curr_row[col];
111
+ if (value1 < curr_row_min) {
112
+ curr_row_min = value1;
110
113
  }
114
+
115
+ curr_row[col] = value1;
111
116
  }
112
117
 
113
118
  /* Return nil as soon as we exceed the threshold. */
@@ -1,44 +1,25 @@
1
- begin
2
- require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
3
- rescue LoadError
4
- begin
5
- require "levenshtein_fast" # If compiled by the build script.
6
- rescue LoadError
7
- $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
8
- end
9
- end
10
-
11
- # The Levenshtein distance is a metric for measuring the amount
12
- # of difference between two sequences (i.e., the so called edit
13
- # distance). The Levenshtein distance between two sequences is
14
- # given by the minimum number of operations needed to transform
15
- # one sequence into the other, where an operation is an
16
- # insertion, deletion, or substitution of a single element.
17
- #
18
- # More information about the Levenshtein distance algorithm:
19
- # http://en.wikipedia.org/wiki/Levenshtein_distance .
1
+ require "levenshtein/exception"
2
+ require "levenshtein/version"
20
3
 
21
4
  module Levenshtein
22
- VERSION = "0.2.0"
23
-
24
5
  # Returns the Levenshtein distance as a number between 0.0 and
25
6
  # 1.0. It's basically the Levenshtein distance divided by the
26
7
  # length of the longest sequence.
27
8
 
28
- def self.normalized_distance(s1, s2, threshold=nil)
29
- s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
9
+ def self.normalized_distance(a1, a2, threshold=nil)
10
+ a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
30
11
 
31
- if s2.length == 0
32
- 0.0 # Since s1.length < s2.length, s1 must be empty as well.
12
+ if a2.length == 0
13
+ 0.0 # Since a1.length < a2.length, a1 must be empty as well.
33
14
  else
34
15
  if threshold
35
- if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
36
- d.to_f/s2.length
16
+ if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
17
+ d.to_f/a2.length
37
18
  else
38
19
  nil
39
20
  end
40
21
  else
41
- self.distance(s1, s2).to_f/s2.length
22
+ self.distance(a1, a2).to_f/a2.length
42
23
  end
43
24
  end
44
25
  end
@@ -53,47 +34,64 @@ module Levenshtein
53
34
  # The sequences should respond to :length and :[] and all objects
54
35
  # in the sequences (as returned by []) should response to :==.
55
36
 
56
- def self.distance(s1, s2, threshold=nil)
57
- s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
37
+ def self.distance(a1, a2, threshold=nil)
38
+ a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
58
39
 
59
40
  # Handle some basic circumstances.
60
41
 
61
- return 0 if s1 == s2
62
- return s2.length if s1.length == 0
42
+ return 0 if a1 == a2
43
+ return a2.length if a1.length == 0
63
44
 
64
45
  if threshold
65
- return nil if (s2.length-s1.length) >= threshold
46
+ return nil if (a2.length-a1.length) >= threshold
66
47
 
67
- a1, a2 = nil, nil
68
- a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
69
- a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
48
+ a3, a4 = nil, nil
49
+ a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
50
+ a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
70
51
 
71
- if a1 and a2
72
- return nil if (a1-a2).length >= threshold
73
- return nil if (a2-a1).length >= threshold
52
+ if a3 and a4
53
+ return nil if (a3-a4).length >= threshold
54
+ return nil if (a4-a3).length >= threshold
74
55
  end
75
56
  end
76
57
 
77
- distance_fast_or_slow(s1, s2, threshold)
58
+ distance_fast_or_slow(a1, a2, threshold)
78
59
  end
79
60
 
80
- def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
81
- if respond_to?(:levenshtein_distance_fast)
82
- levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
61
+ def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
62
+ if respond_to?(:distance_fast)
63
+ distance_fast(a1, a2, threshold) # Implemented in C.
83
64
  else
84
- levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
65
+ distance_slow(a1, a2, threshold) # Implemented in Ruby.
85
66
  end
86
67
  end
87
68
 
88
- def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
89
- row = (0..s1.length).to_a
69
+ def self.distance_slow(a1, a2, threshold) # :nodoc:
70
+ l1 = a1.length
71
+ l2 = a2.length
90
72
 
91
- 1.upto(s2.length) do |y|
92
- prow = row
93
- row = [y]
73
+ offset = 0
74
+
75
+ while offset < l1 and offset < l2 and a1[offset] == a2[offset]
76
+ offset += 1
77
+ end
78
+
79
+ while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
80
+ l1 -= 1
81
+ l2 -= 1
82
+ end
83
+
84
+ l1 -= offset
85
+ l2 -= offset
94
86
 
95
- 1.upto(s1.length) do |x|
96
- row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
87
+ crow = (0..l1).to_a
88
+
89
+ 1.upto(l2) do |y|
90
+ prow = crow
91
+ crow = [y]
92
+
93
+ 1.upto(l1) do |x|
94
+ crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
97
95
  end
98
96
 
99
97
  # Stop analysing this sequence as soon as the best possible
@@ -101,9 +99,19 @@ module Levenshtein
101
99
  # (The minimum value in the next row will be equal to or greater
102
100
  # than the minimum value in this row.)
103
101
 
104
- return nil if threshold and row.min >= threshold
102
+ return nil if threshold and crow.min >= threshold
105
103
  end
106
104
 
107
- row[-1]
105
+ crow[-1]
106
+ end
107
+ end
108
+
109
+ begin
110
+ require "levenshtein/levenshtein_fast" # Compiled by RubyGems.
111
+ rescue LoadError
112
+ begin
113
+ require "levenshtein_fast" # Compiled by the build script.
114
+ rescue LoadError
115
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
108
116
  end
109
117
  end
@@ -0,0 +1,4 @@
1
+ module Levenshtein
2
+ class LevenshteinException < RuntimeError
3
+ end
4
+ end
@@ -0,0 +1,3 @@
1
+ module Levenshtein
2
+ VERSION = "0.2.1"
3
+ end
@@ -12,6 +12,8 @@ module Levenshtein
12
12
  end
13
13
 
14
14
  def [](pos)
15
+ raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
16
+
15
17
  @sequence[pos]
16
18
  end
17
19
  end
@@ -105,21 +107,31 @@ end
105
107
 
106
108
  class TestLevenshteinSlow < Test::Unit::TestCase
107
109
  def test_erik_veenstra
108
- assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
110
+ assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
111
+ assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
109
112
  end
110
113
 
111
- def test_empty_sequence
112
- assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
113
- assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
114
+ def test_empty_string
115
+ assert_equal(0, Levenshtein.distance_slow("", "", nil))
116
+ assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
117
+ assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
114
118
  end
115
119
 
116
- def test_same_sequence
117
- assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
118
- assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
120
+ def test_same_string
121
+ assert_equal(0, Levenshtein.distance_slow("", "", nil))
122
+ assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
119
123
  end
120
124
 
121
125
  def test_threshold
122
- assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
123
- assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
126
+ assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
127
+ assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
128
+ assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
129
+ end
130
+
131
+ def test_same_head_and_or_tail
132
+ assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
133
+ assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
134
+ assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
135
+ assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
124
136
  end
125
137
  end
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: levenshtein
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ hash: 21
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 1
10
+ version: 0.2.1
5
11
  platform: ruby
6
12
  authors:
7
13
  - Erik Veenstra
@@ -9,8 +15,7 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2009-07-11 00:00:00 +02:00
13
- default_executable:
18
+ date: 2012-02-11 00:00:00 Z
14
19
  dependencies: []
15
20
 
16
21
  description: Calculates the Levenshtein distance between two byte strings.
@@ -22,18 +27,21 @@ extensions:
22
27
  extra_rdoc_files: []
23
28
 
24
29
  files:
30
+ - lib/levenshtein/exception.rb
31
+ - lib/levenshtein/version.rb
25
32
  - lib/levenshtein.rb
26
- - ext/levenshtein/extconf.rb
27
- - ext/levenshtein/levenshtein_array_of_strings.c
28
- - ext/levenshtein/levenshtein_fast.c
29
33
  - ext/levenshtein/levenshtein_string.c
30
34
  - ext/levenshtein/levenshtein_generic.c
35
+ - ext/levenshtein/levenshtein.h
36
+ - ext/levenshtein/levenshtein_fast.c
37
+ - ext/levenshtein/levenshtein_array_of_strings.c
31
38
  - ext/levenshtein/levenshtein_array.c
39
+ - ext/levenshtein/extconf.rb
32
40
  - README
33
41
  - LICENSE
34
42
  - VERSION
35
43
  - CHANGELOG
36
- has_rdoc: true
44
+ - test/test.rb
37
45
  homepage: http://www.erikveen.dds.nl/levenshtein/index.html
38
46
  licenses: []
39
47
 
@@ -44,27 +52,33 @@ rdoc_options:
44
52
  - VERSION
45
53
  - CHANGELOG
46
54
  - --title
47
- - levenshtein (0.2.0)
55
+ - levenshtein (0.2.1)
48
56
  - --main
49
57
  - README
50
58
  require_paths:
51
59
  - lib
52
60
  required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
53
62
  requirements:
54
63
  - - ">="
55
64
  - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
56
68
  version: "0"
57
- version:
58
69
  required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
59
71
  requirements:
60
72
  - - ">="
61
73
  - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
62
77
  version: "0"
63
- version:
64
78
  requirements: []
65
79
 
66
80
  rubyforge_project: levenshtein
67
- rubygems_version: 1.3.4
81
+ rubygems_version: 1.8.12
68
82
  signing_key:
69
83
  specification_version: 3
70
84
  summary: Calculates the Levenshtein distance between two byte strings.