levenshtein 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ 0.2.1 (11-02-2012)
2
+
3
+ * Better memory handling.
4
+
5
+ * Little speed improvements.
6
+
7
+ * Ruby 1.9 compatible?
8
+
1
9
  0.2.0 (11-07-2009)
2
10
 
3
11
  * Return 0 instead of 0.0 in case of empty strings.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
@@ -0,0 +1,13 @@
1
+ #ifdef RARRAY_PTR
2
+ #else
3
+ #define RARRAY_PTR(o) (RARRAY(o)->ptr)
4
+ #define RARRAY_LEN(o) (RARRAY(o)->len)
5
+ #endif
6
+
7
+ #ifdef RSTRING_PTR
8
+ #else
9
+ #define RSTRING_PTR(o) (RSTRING(o)->ptr)
10
+ #define RSTRING_LEN(o) (RSTRING(o)->len)
11
+ #endif
12
+
13
+ VALUE mLevenshtein;
@@ -1,19 +1,19 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
10
-
11
- ID id_eql = rb_intern("==");
11
+ int value1, value2;
12
12
 
13
13
  /* Get the sizes of both arrays. */
14
14
 
15
- l1 = RARRAY(rb_o1)->len;
16
- l2 = RARRAY(rb_o2)->len;
15
+ l1 = RARRAY_LEN(rb_o1);
16
+ l2 = RARRAY_LEN(rb_o2);
17
17
 
18
18
  /* Convert Ruby's threshold to C's threshold. */
19
19
 
@@ -26,13 +26,14 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
26
26
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
27
 
28
28
  offset = 0;
29
- while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
29
+
30
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
30
31
  offset++;
31
32
  }
32
33
 
33
34
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
34
35
 
35
- while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
36
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
36
37
  l1--;
37
38
  l2--;
38
39
  }
@@ -57,12 +58,8 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
57
58
 
58
59
  /* Allocate memory for both rows */
59
60
 
60
- prev_row = ALLOC_N(int, l1+1);
61
- curr_row = ALLOC_N(int, l1+1);
62
-
63
- if ((prev_row == NULL) || (curr_row == NULL)) {
64
- rb_raise(rb_eNoMemError, "out of memory");
65
- }
61
+ prev_row = (int*) ALLOC_N(int, (l1+1));
62
+ curr_row = (int*) ALLOC_N(int, (l1+1));
66
63
 
67
64
  /* Initialize the current row. */
68
65
 
@@ -73,7 +70,9 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
73
70
  for (row=1; row<=l2; row++) {
74
71
  /* Copy the current row to the previous row. */
75
72
 
76
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
73
+ temp_row = prev_row;
74
+ prev_row = curr_row;
75
+ curr_row = temp_row;
77
76
 
78
77
  /* Calculate the values of the current row. */
79
78
 
@@ -83,25 +82,29 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
83
82
  for (col=1; col<=l1; col++) {
84
83
  /* Equal (cost=0) or substitution (cost=1). */
85
84
 
86
- curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
85
+ value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
87
86
 
88
87
  /* Insertion if it's cheaper than substitution. */
89
88
 
90
- if (prev_row[col]+1 < curr_row[col]) {
91
- curr_row[col] = prev_row[col]+1;
89
+ value2 = prev_row[col]+1;
90
+ if (value2 < value1) {
91
+ value1 = value2;
92
92
  }
93
93
 
94
94
  /* Deletion if it's cheaper than substitution. */
95
95
 
96
- if (curr_row[col-1]+1 < curr_row[col]) {
97
- curr_row[col] = curr_row[col-1]+1;
96
+ value2 = curr_row[col-1]+1;
97
+ if (value2 < value1) {
98
+ value1 = value2;
98
99
  }
99
100
 
100
101
  /* Keep track of the minimum value on this row. */
101
102
 
102
- if (curr_row[col] < curr_row_min) {
103
- curr_row_min = curr_row[col];
103
+ if (value1 < curr_row_min) {
104
+ curr_row_min = value1;
104
105
  }
106
+
107
+ curr_row[col] = value1;
105
108
  }
106
109
 
107
110
  /* Return nil as soon as we exceed the threshold. */
@@ -1,17 +1,19 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
11
+ int value1, value2;
10
12
 
11
13
  /* Get the sizes of both arrays. */
12
14
 
13
- l1 = RARRAY(rb_o1)->len;
14
- l2 = RARRAY(rb_o2)->len;
15
+ l1 = RARRAY_LEN(rb_o1);
16
+ l2 = RARRAY_LEN(rb_o2);
15
17
 
16
18
  /* Convert Ruby's threshold to C's threshold. */
17
19
 
@@ -24,13 +26,14 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
24
26
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
25
27
 
26
28
  offset = 0;
27
- while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
29
+
30
+ while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
28
31
  offset++;
29
32
  }
30
33
 
31
34
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
32
35
 
33
- while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
36
+ while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
34
37
  l1--;
35
38
  l2--;
36
39
  }
@@ -55,12 +58,8 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
55
58
 
56
59
  /* Allocate memory for both rows */
57
60
 
58
- prev_row = ALLOC_N(int, l1+1);
59
- curr_row = ALLOC_N(int, l1+1);
60
-
61
- if ((prev_row == NULL) || (curr_row == NULL)) {
62
- rb_raise(rb_eNoMemError, "out of memory");
63
- }
61
+ prev_row = (int*) ALLOC_N(int, (l1+1));
62
+ curr_row = (int*) ALLOC_N(int, (l1+1));
64
63
 
65
64
  /* Initialize the current row. */
66
65
 
@@ -71,7 +70,9 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
71
70
  for (row=1; row<=l2; row++) {
72
71
  /* Copy the current row to the previous row. */
73
72
 
74
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
73
+ temp_row = prev_row;
74
+ prev_row = curr_row;
75
+ curr_row = temp_row;
75
76
 
76
77
  /* Calculate the values of the current row. */
77
78
 
@@ -81,25 +82,29 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
81
82
  for (col=1; col<=l1; col++) {
82
83
  /* Equal (cost=0) or substitution (cost=1). */
83
84
 
84
- curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
+ value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
86
 
86
87
  /* Insertion if it's cheaper than substitution. */
87
88
 
88
- if (prev_row[col]+1 < curr_row[col]) {
89
- curr_row[col] = prev_row[col]+1;
89
+ value2 = prev_row[col]+1;
90
+ if (value2 < value1) {
91
+ value1 = value2;
90
92
  }
91
93
 
92
94
  /* Deletion if it's cheaper than substitution. */
93
95
 
94
- if (curr_row[col-1]+1 < curr_row[col]) {
95
- curr_row[col] = curr_row[col-1]+1;
96
+ value2 = curr_row[col-1]+1;
97
+ if (value2 < value1) {
98
+ value1 = value2;
96
99
  }
97
100
 
98
101
  /* Keep track of the minimum value on this row. */
99
102
 
100
- if (curr_row[col] < curr_row_min) {
101
- curr_row_min = curr_row[col];
103
+ if (value1 < curr_row_min) {
104
+ curr_row_min = value1;
102
105
  }
106
+
107
+ curr_row[col] = value1;
103
108
  }
104
109
 
105
110
  /* Return nil as soon as we exceed the threshold. */
@@ -1,4 +1,5 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
@@ -15,7 +16,7 @@ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_t
15
16
  }
16
17
 
17
18
  void Init_levenshtein_fast() {
18
- VALUE mLevenshtein = rb_define_module("Levenshtein");
19
+ mLevenshtein = rb_const_get(rb_mKernel, rb_intern("Levenshtein"));
19
20
 
20
- rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
21
+ rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3);
21
22
  }
@@ -1,16 +1,17 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
11
+ int value1, value2;
10
12
 
11
- ID id_length = rb_intern("length");
12
- ID id_get = rb_intern("[]");
13
- ID id_equal = rb_intern("==");
13
+ ID id_length = rb_intern("length");
14
+ ID id_get = rb_intern("[]");
14
15
 
15
16
  /* Get the sizes of both sequences. */
16
17
 
@@ -28,13 +29,14 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
28
29
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
29
30
 
30
31
  offset = 0;
31
- while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
32
+
33
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
32
34
  offset++;
33
35
  }
34
36
 
35
37
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
36
38
 
37
- while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
39
+ while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
38
40
  l1--;
39
41
  l2--;
40
42
  }
@@ -59,12 +61,8 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
59
61
 
60
62
  /* Allocate memory for both rows */
61
63
 
62
- prev_row = ALLOC_N(int, l1+1);
63
- curr_row = ALLOC_N(int, l1+1);
64
-
65
- if ((prev_row == NULL) || (curr_row == NULL)) {
66
- rb_raise(rb_eNoMemError, "out of memory");
67
- }
64
+ prev_row = (int*) ALLOC_N(int, (l1+1));
65
+ curr_row = (int*) ALLOC_N(int, (l1+1));
68
66
 
69
67
  /* Initialize the current row. */
70
68
 
@@ -75,7 +73,9 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
75
73
  for (row=1; row<=l2; row++) {
76
74
  /* Copy the current row to the previous row. */
77
75
 
78
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
76
+ temp_row = prev_row;
77
+ prev_row = curr_row;
78
+ curr_row = temp_row;
79
79
 
80
80
  /* Calculate the values of the current row. */
81
81
 
@@ -85,25 +85,29 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
85
85
  for (col=1; col<=l1; col++) {
86
86
  /* Equal (cost=0) or substitution (cost=1). */
87
87
 
88
- curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
88
+ value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
89
 
90
90
  /* Insertion if it's cheaper than substitution. */
91
91
 
92
- if (prev_row[col]+1 < curr_row[col]) {
93
- curr_row[col] = prev_row[col]+1;
92
+ value2 = prev_row[col]+1;
93
+ if (value2 < value1) {
94
+ value1 = value2;
94
95
  }
95
96
 
96
97
  /* Deletion if it's cheaper than substitution. */
97
98
 
98
- if (curr_row[col-1]+1 < curr_row[col]) {
99
- curr_row[col] = curr_row[col-1]+1;
99
+ value2 = curr_row[col-1]+1;
100
+ if (value2 < value1) {
101
+ value1 = value2;
100
102
  }
101
103
 
102
104
  /* Keep track of the minimum value on this row. */
103
105
 
104
- if (curr_row[col] < curr_row_min) {
105
- curr_row_min = curr_row[col];
106
+ if (value1 < curr_row_min) {
107
+ curr_row_min = value1;
106
108
  }
109
+
110
+ curr_row[col] = value1;
107
111
  }
108
112
 
109
113
  /* Return nil as soon as we exceed the threshold. */
@@ -1,25 +1,27 @@
1
1
  #include "ruby.h"
2
+ #include "levenshtein.h"
2
3
 
3
4
  VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
5
  int threshold;
5
6
  int l1, l2;
6
- int *prev_row, *curr_row;
7
+ int *prev_row, *curr_row, *temp_row;
7
8
  int col, row;
8
9
  int curr_row_min, result;
9
10
  int offset;
11
+ int value1, value2;
10
12
  char *s1, *s2;
11
13
 
12
14
  /* Convert Ruby's s1 to C's s1. */
13
15
 
14
16
  rb_o1 = StringValue(rb_o1);
15
- s1 = RSTRING(rb_o1)->ptr;
16
- l1 = RSTRING(rb_o1)->len;
17
+ s1 = RSTRING_PTR(rb_o1);
18
+ l1 = RSTRING_LEN(rb_o1);
17
19
 
18
20
  /* Convert Ruby's s2 to C's s2. */
19
21
 
20
22
  rb_o2 = StringValue(rb_o2);
21
- s2 = RSTRING(rb_o2)->ptr;
22
- l2 = RSTRING(rb_o2)->len;
23
+ s2 = RSTRING_PTR(rb_o2);
24
+ l2 = RSTRING_LEN(rb_o2);
23
25
 
24
26
  /* Convert Ruby's threshold to C's threshold. */
25
27
 
@@ -32,13 +34,14 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
32
34
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
33
35
 
34
36
  offset = 0;
35
- while (s1[offset] == s2[offset]) {
37
+
38
+ while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
36
39
  offset++;
37
40
  }
38
41
 
39
42
  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
40
43
 
41
- while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
44
+ while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
42
45
  l1--;
43
46
  l2--;
44
47
  }
@@ -63,12 +66,8 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
63
66
 
64
67
  /* Allocate memory for both rows */
65
68
 
66
- prev_row = ALLOC_N(int, l1+1);
67
- curr_row = ALLOC_N(int, l1+1);
68
-
69
- if ((prev_row == NULL) || (curr_row == NULL)) {
70
- rb_raise(rb_eNoMemError, "out of memory");
71
- }
69
+ prev_row = (int*) ALLOC_N(int, (l1+1));
70
+ curr_row = (int*) ALLOC_N(int, (l1+1));
72
71
 
73
72
  /* Initialize the current row. */
74
73
 
@@ -79,7 +78,9 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
79
78
  for (row=1; row<=l2; row++) {
80
79
  /* Copy the current row to the previous row. */
81
80
 
82
- memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
81
+ temp_row = prev_row;
82
+ prev_row = curr_row;
83
+ curr_row = temp_row;
83
84
 
84
85
  /* Calculate the values of the current row. */
85
86
 
@@ -89,25 +90,29 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
89
90
  for (col=1; col<=l1; col++) {
90
91
  /* Equal (cost=0) or substitution (cost=1). */
91
92
 
92
- curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
93
+ value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
93
94
 
94
95
  /* Insertion if it's cheaper than substitution. */
95
96
 
96
- if (prev_row[col]+1 < curr_row[col]) {
97
- curr_row[col] = prev_row[col]+1;
97
+ value2 = prev_row[col]+1;
98
+ if (value2 < value1) {
99
+ value1 = value2;
98
100
  }
99
101
 
100
102
  /* Deletion if it's cheaper than substitution. */
101
103
 
102
- if (curr_row[col-1]+1 < curr_row[col]) {
103
- curr_row[col] = curr_row[col-1]+1;
104
+ value2 = curr_row[col-1]+1;
105
+ if (value2 < value1) {
106
+ value1 = value2;
104
107
  }
105
108
 
106
109
  /* Keep track of the minimum value on this row. */
107
110
 
108
- if (curr_row[col] < curr_row_min) {
109
- curr_row_min = curr_row[col];
111
+ if (value1 < curr_row_min) {
112
+ curr_row_min = value1;
110
113
  }
114
+
115
+ curr_row[col] = value1;
111
116
  }
112
117
 
113
118
  /* Return nil as soon as we exceed the threshold. */
@@ -1,44 +1,25 @@
1
- begin
2
- require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
3
- rescue LoadError
4
- begin
5
- require "levenshtein_fast" # If compiled by the build script.
6
- rescue LoadError
7
- $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
8
- end
9
- end
10
-
11
- # The Levenshtein distance is a metric for measuring the amount
12
- # of difference between two sequences (i.e., the so called edit
13
- # distance). The Levenshtein distance between two sequences is
14
- # given by the minimum number of operations needed to transform
15
- # one sequence into the other, where an operation is an
16
- # insertion, deletion, or substitution of a single element.
17
- #
18
- # More information about the Levenshtein distance algorithm:
19
- # http://en.wikipedia.org/wiki/Levenshtein_distance .
1
+ require "levenshtein/exception"
2
+ require "levenshtein/version"
20
3
 
21
4
  module Levenshtein
22
- VERSION = "0.2.0"
23
-
24
5
  # Returns the Levenshtein distance as a number between 0.0 and
25
6
  # 1.0. It's basically the Levenshtein distance divided by the
26
7
  # length of the longest sequence.
27
8
 
28
- def self.normalized_distance(s1, s2, threshold=nil)
29
- s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
9
+ def self.normalized_distance(a1, a2, threshold=nil)
10
+ a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
30
11
 
31
- if s2.length == 0
32
- 0.0 # Since s1.length < s2.length, s1 must be empty as well.
12
+ if a2.length == 0
13
+ 0.0 # Since a1.length <= a2.length, a1 must be empty as well.
33
14
  else
34
15
  if threshold
35
- if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
36
- d.to_f/s2.length
16
+ if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
17
+ d.to_f/a2.length
37
18
  else
38
19
  nil
39
20
  end
40
21
  else
41
- self.distance(s1, s2).to_f/s2.length
22
+ self.distance(a1, a2).to_f/a2.length
42
23
  end
43
24
  end
44
25
  end
@@ -53,47 +34,64 @@ module Levenshtein
53
34
  # The sequences should respond to :length and :[] and all objects
54
35
  # in the sequences (as returned by []) should respond to :==.
55
36
 
56
- def self.distance(s1, s2, threshold=nil)
57
- s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
37
+ def self.distance(a1, a2, threshold=nil)
38
+ a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
58
39
 
59
40
  # Handle some basic circumstances.
60
41
 
61
- return 0 if s1 == s2
62
- return s2.length if s1.length == 0
42
+ return 0 if a1 == a2
43
+ return a2.length if a1.length == 0
63
44
 
64
45
  if threshold
65
- return nil if (s2.length-s1.length) >= threshold
46
+ return nil if (a2.length-a1.length) >= threshold
66
47
 
67
- a1, a2 = nil, nil
68
- a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
69
- a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
48
+ a3, a4 = nil, nil
49
+ a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
50
+ a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
70
51
 
71
- if a1 and a2
72
- return nil if (a1-a2).length >= threshold
73
- return nil if (a2-a1).length >= threshold
52
+ if a3 and a4
53
+ return nil if (a3-a4).length >= threshold
54
+ return nil if (a4-a3).length >= threshold
74
55
  end
75
56
  end
76
57
 
77
- distance_fast_or_slow(s1, s2, threshold)
58
+ distance_fast_or_slow(a1, a2, threshold)
78
59
  end
79
60
 
80
- def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
81
- if respond_to?(:levenshtein_distance_fast)
82
- levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
61
+ def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
62
+ if respond_to?(:distance_fast)
63
+ distance_fast(a1, a2, threshold) # Implemented in C.
83
64
  else
84
- levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
65
+ distance_slow(a1, a2, threshold) # Implemented in Ruby.
85
66
  end
86
67
  end
87
68
 
88
- def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
89
- row = (0..s1.length).to_a
69
+ def self.distance_slow(a1, a2, threshold) # :nodoc:
70
+ l1 = a1.length
71
+ l2 = a2.length
90
72
 
91
- 1.upto(s2.length) do |y|
92
- prow = row
93
- row = [y]
73
+ offset = 0
74
+
75
+ while offset < l1 and offset < l2 and a1[offset] == a2[offset]
76
+ offset += 1
77
+ end
78
+
79
+ while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
80
+ l1 -= 1
81
+ l2 -= 1
82
+ end
83
+
84
+ l1 -= offset
85
+ l2 -= offset
94
86
 
95
- 1.upto(s1.length) do |x|
96
- row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
87
+ crow = (0..l1).to_a
88
+
89
+ 1.upto(l2) do |y|
90
+ prow = crow
91
+ crow = [y]
92
+
93
+ 1.upto(l1) do |x|
94
+ crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
97
95
  end
98
96
 
99
97
  # Stop analysing this sequence as soon as the best possible
@@ -101,9 +99,19 @@ module Levenshtein
101
99
  # (The minimum value in the next row will be equal to or greater
102
100
  # than the minimum value in this row.)
103
101
 
104
- return nil if threshold and row.min >= threshold
102
+ return nil if threshold and crow.min >= threshold
105
103
  end
106
104
 
107
- row[-1]
105
+ crow[-1]
106
+ end
107
+ end
108
+
109
+ begin
110
+ require "levenshtein/levenshtein_fast" # Compiled by RubyGems.
111
+ rescue LoadError
112
+ begin
113
+ require "levenshtein_fast" # Compiled by the build script.
114
+ rescue LoadError
115
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
108
116
  end
109
117
  end
@@ -0,0 +1,4 @@
1
+ module Levenshtein
2
+ class LevenshteinException < RuntimeError
3
+ end
4
+ end
@@ -0,0 +1,3 @@
1
+ module Levenshtein
2
+ VERSION = "0.2.1"
3
+ end
@@ -12,6 +12,8 @@ module Levenshtein
12
12
  end
13
13
 
14
14
  def [](pos)
15
+ raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
16
+
15
17
  @sequence[pos]
16
18
  end
17
19
  end
@@ -105,21 +107,31 @@ end
105
107
 
106
108
  class TestLevenshteinSlow < Test::Unit::TestCase
107
109
  def test_erik_veenstra
108
- assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
110
+ assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
111
+ assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
109
112
  end
110
113
 
111
- def test_empty_sequence
112
- assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
113
- assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
114
+ def test_empty_string
115
+ assert_equal(0, Levenshtein.distance_slow("", "", nil))
116
+ assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
117
+ assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
114
118
  end
115
119
 
116
- def test_same_sequence
117
- assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
118
- assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
120
+ def test_same_string
121
+ assert_equal(0, Levenshtein.distance_slow("", "", nil))
122
+ assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
119
123
  end
120
124
 
121
125
  def test_threshold
122
- assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
123
- assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
126
+ assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
127
+ assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
128
+ assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
129
+ end
130
+
131
+ def test_same_head_and_or_tail
132
+ assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
133
+ assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
134
+ assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
135
+ assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
124
136
  end
125
137
  end
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: levenshtein
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ hash: 21
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 1
10
+ version: 0.2.1
5
11
  platform: ruby
6
12
  authors:
7
13
  - Erik Veenstra
@@ -9,8 +15,7 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2009-07-11 00:00:00 +02:00
13
- default_executable:
18
+ date: 2012-02-11 00:00:00 Z
14
19
  dependencies: []
15
20
 
16
21
  description: Calculates the Levenshtein distance between two byte strings.
@@ -22,18 +27,21 @@ extensions:
22
27
  extra_rdoc_files: []
23
28
 
24
29
  files:
30
+ - lib/levenshtein/exception.rb
31
+ - lib/levenshtein/version.rb
25
32
  - lib/levenshtein.rb
26
- - ext/levenshtein/extconf.rb
27
- - ext/levenshtein/levenshtein_array_of_strings.c
28
- - ext/levenshtein/levenshtein_fast.c
29
33
  - ext/levenshtein/levenshtein_string.c
30
34
  - ext/levenshtein/levenshtein_generic.c
35
+ - ext/levenshtein/levenshtein.h
36
+ - ext/levenshtein/levenshtein_fast.c
37
+ - ext/levenshtein/levenshtein_array_of_strings.c
31
38
  - ext/levenshtein/levenshtein_array.c
39
+ - ext/levenshtein/extconf.rb
32
40
  - README
33
41
  - LICENSE
34
42
  - VERSION
35
43
  - CHANGELOG
36
- has_rdoc: true
44
+ - test/test.rb
37
45
  homepage: http://www.erikveen.dds.nl/levenshtein/index.html
38
46
  licenses: []
39
47
 
@@ -44,27 +52,33 @@ rdoc_options:
44
52
  - VERSION
45
53
  - CHANGELOG
46
54
  - --title
47
- - levenshtein (0.2.0)
55
+ - levenshtein (0.2.1)
48
56
  - --main
49
57
  - README
50
58
  require_paths:
51
59
  - lib
52
60
  required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
53
62
  requirements:
54
63
  - - ">="
55
64
  - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
56
68
  version: "0"
57
- version:
58
69
  required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
59
71
  requirements:
60
72
  - - ">="
61
73
  - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
62
77
  version: "0"
63
- version:
64
78
  requirements: []
65
79
 
66
80
  rubyforge_project: levenshtein
67
- rubygems_version: 1.3.4
81
+ rubygems_version: 1.8.12
68
82
  signing_key:
69
83
  specification_version: 3
70
84
  summary: Calculates the Levenshtein distance between two byte strings.