levenshtein-19 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ tmp/*
data/CHANGELOG ADDED
@@ -0,0 +1,26 @@
1
+ 0.3.0 (11-20-2011)
2
+
3
+ * Use RARRAY_LEN, RSTRING_LEN, and RSTRING_PTR for 1.9 compatibility.
4
+
5
+ 0.2.0 (11-07-2009)
6
+
7
+ * Return 0 instead of 0.0 in case of empty strings.
8
+
9
+ * Added specific support for arrays.
10
+
11
+ * Added specific support for arrays of strings.
12
+
13
+ * Added generic support for all (?) kinds of sequences.
14
+
15
+ * Moved a lot of code to the C world.
16
+
17
+ 0.1.1 (06-10-2008)
18
+
19
+ * If one of the strings was both the begin and the end of the
20
+ other string, it would be stripped from both ends. Example:
21
+ Levenshtein.distance("abracadabra", "abra") resulted in 3
22
+ instead of 7. It's fixed now.
23
+
24
+ 0.1.0 (24-05-2008)
25
+
26
+ * First release.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in levenshtein-19.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright Erik Veenstra <levenshtein@erikveen.dds.nl>
2
+ #
3
+ # This program is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU General Public License,
5
+ # version 2, as published by the Free Software Foundation.
6
+ #
7
+ # This program is distributed in the hope that it will be
8
+ # useful, but WITHOUT ANY WARRANTY; without even the implied
9
+ # warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
10
+ # PURPOSE. See the GNU General Public License for more details.
11
+ #
12
+ # You should have received a copy of the GNU General Public
13
+ # License along with this program; if not, write to the Free
14
+ # Software Foundation, Inc., 59 Temple Place, Suite 330,
15
+ # Boston, MA 02111-1307 USA.
data/README ADDED
@@ -0,0 +1,15 @@
1
+ The Levenshtein distance is a metric for measuring the amount of difference
2
+ between two sequences (i.e., the so-called edit distance). The Levenshtein
3
+ distance between two sequences is given by the minimum number of operations
4
+ needed to transform one sequence into the other, where an operation is an
5
+ insertion, deletion, or substitution of a single element.
6
+
7
+ The two sequences can be two strings, two arrays, or two other objects.
8
+ Strings, arrays and arrays of strings are handled with optimized (very fast) C
9
+ code. All other sequences are handled with generic (fast) C code.
10
+
11
+ More information about the Levenshtein distance algorithm:
12
+ http://en.wikipedia.org/wiki/Levenshtein_distance .
13
+
14
+ NOTE: This gem was written by Erik Veenstra. I have made slight modifications
15
+ to it for compatibility with Ruby 1.9.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,10 @@
1
+ require "mkmf"
2
+
3
+ dir_config("levenshtein")
4
+
5
+ have_library("levenshtein_array")
6
+ have_library("levenshtein_array_of_strings")
7
+ have_library("levenshtein_generic")
8
+ have_library("levenshtein_string")
9
+
10
+ create_makefile("levenshtein/levenshtein_fast")
@@ -0,0 +1,127 @@
1
+ #include "ruby.h"
2
+
3
+ /*
4
+  * Levenshtein distance between two Ruby arrays (elements compared with "==").
5
+  *
6
+  * rb_o1, rb_o2: the arrays to compare; RARRAY_LEN is applied to both.
7
+  * rb_threshold: nil for no threshold, otherwise a Fixnum; nil is returned as
8
+  * soon as the smallest value of a row reaches it.
9
+  *
10
+  * Returns the distance as a Fixnum, or nil when the threshold is reached.
11
+  *
12
+  * NOTE(review): if "==" raises, the two rows are not released.
13
+  */
14
+ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
15
+   int threshold;
16
+   int l1, l2;
17
+   int *prev_row, *curr_row;
18
+   int col, row;
19
+   int curr_row_min, result;
20
+   int offset;
21
+
22
+   ID id_eql = rb_intern("==");
23
+
24
+   /* Get the sizes of both arrays. */
25
+   l1 = RARRAY_LEN(rb_o1);
26
+   l2 = RARRAY_LEN(rb_o2);
27
+
28
+   /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */
29
+   if (!NIL_P(rb_threshold)) {
30
+     threshold = FIX2INT(rb_threshold);
31
+   } else {
32
+     threshold = -1;
33
+   }
34
+
35
+   /*
36
+    * Do the expensive calculation on a subset of the sequences, if possible,
37
+    * by removing the common prefix. The scan stops at the end of the shorter
38
+    * array: rb_ary_entry returns nil past the end, so without the bounds
39
+    * [1, nil] and [1] would compare nil == nil forever.
40
+    */
41
+   offset = 0;
42
+   while ((offset < l1) && (offset < l2) && RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset)))) {
43
+     offset++;
44
+   }
45
+
46
+   /*
47
+    * Do the expensive calculation on a subset of the sequences, if possible,
48
+    * by removing the common postfix.
49
+    */
50
+   while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
51
+     l1--;
52
+     l2--;
53
+   }
54
+
55
+   l1 -= offset;
56
+   l2 -= offset;
57
+
58
+   /* The Levenshtein algorithm itself. */
59
+   /*        s1=                */
60
+   /*        ERIK               */
61
+   /*                           */
62
+   /*       01234               */
63
+   /* s2= V 11234               */
64
+   /*     E 21234               */
65
+   /*     E 32234               */
66
+   /*     N 43334 <- prev_row   */
67
+   /*     S 54444 <- curr_row   */
68
+   /*     T 65555               */
69
+   /*     R 76566               */
70
+   /*     A 87667               */
71
+
72
+   /*
73
+    * Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself,
74
+    * so there is no NULL to check for; its memory is released with xfree.
75
+    */
76
+   prev_row = ALLOC_N(int, l1+1);
77
+   curr_row = ALLOC_N(int, l1+1);
78
+
79
+   /* Initialize the current row. */
80
+   for (col=0; col<=l1; col++) {
81
+     curr_row[col] = col;
82
+   }
83
+
84
+   for (row=1; row<=l2; row++) {
85
+     /* Copy the current row to the previous row. */
86
+     memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
87
+
88
+     /* Calculate the values of the current row. */
89
+     curr_row[0] = row;
90
+     curr_row_min = row;
91
+
92
+     for (col=1; col<=l1; col++) {
93
+       /* Equal (cost=0) or substitution (cost=1). */
94
+       curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
95
+
96
+       /* Insertion if it's cheaper than substitution. */
97
+       if (prev_row[col]+1 < curr_row[col]) {
98
+         curr_row[col] = prev_row[col]+1;
99
+       }
100
+
101
+       /* Deletion if it's cheaper than substitution. */
102
+       if (curr_row[col-1]+1 < curr_row[col]) {
103
+         curr_row[col] = curr_row[col-1]+1;
104
+       }
105
+
106
+       /* Keep track of the minimum value on this row. */
107
+       if (curr_row[col] < curr_row_min) {
108
+         curr_row_min = curr_row[col];
109
+       }
110
+     }
111
+
112
+     /* Return nil as soon as the row minimum reaches the threshold. */
113
+     if (threshold > -1 && curr_row_min >= threshold) {
114
+       xfree(prev_row);
115
+       xfree(curr_row);
116
+       return Qnil;
117
+     }
118
+   }
119
+
120
+   /* The result is the last value on the last row. */
121
+   result = curr_row[l1];
122
+   xfree(prev_row);
123
+   xfree(curr_row);
124
+
125
+   /* Return the Ruby version of the result. */
126
+   return INT2FIX(result);
127
+ }
@@ -0,0 +1,125 @@
1
+ #include "ruby.h"
2
+
3
+ /*
4
+  * Levenshtein distance between two Ruby arrays of strings (rb_str_cmp on pairs).
5
+  *
6
+  * rb_o1, rb_o2: the arrays to compare; every element is handed to rb_str_cmp.
7
+  * rb_threshold: nil for no threshold, otherwise a Fixnum; nil is returned as
8
+  * soon as the smallest value of a row reaches it.
9
+  *
10
+  * Returns the distance as a Fixnum, or nil when the threshold is reached.
11
+  *
12
+  * NOTE(review): elements are assumed to be strings; nothing here checks that.
13
+  */
14
+ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
15
+   int threshold;
16
+   int l1, l2;
17
+   int *prev_row, *curr_row;
18
+   int col, row;
19
+   int curr_row_min, result;
20
+   int offset;
21
+
22
+   /* Get the sizes of both arrays. */
23
+   l1 = RARRAY_LEN(rb_o1);
24
+   l2 = RARRAY_LEN(rb_o2);
25
+
26
+   /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */
27
+   if (!NIL_P(rb_threshold)) {
28
+     threshold = FIX2INT(rb_threshold);
29
+   } else {
30
+     threshold = -1;
31
+   }
32
+
33
+   /*
34
+    * Do the expensive calculation on a subset of the sequences, if possible,
35
+    * by removing the common prefix. The scan stops at the end of the shorter
36
+    * array: rb_ary_entry returns nil past the end, and nil is not a string
37
+    * that may be handed to rb_str_cmp.
38
+    */
39
+   offset = 0;
40
+   while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
41
+     offset++;
42
+   }
43
+
44
+   /*
45
+    * Do the expensive calculation on a subset of the sequences, if possible,
46
+    * by removing the common postfix.
47
+    */
48
+   while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0)) {
49
+     l1--;
50
+     l2--;
51
+   }
52
+
53
+   l1 -= offset;
54
+   l2 -= offset;
55
+
56
+   /* The Levenshtein algorithm itself. */
57
+   /*        s1=                */
58
+   /*        ERIK               */
59
+   /*                           */
60
+   /*       01234               */
61
+   /* s2= V 11234               */
62
+   /*     E 21234               */
63
+   /*     E 32234               */
64
+   /*     N 43334 <- prev_row   */
65
+   /*     S 54444 <- curr_row   */
66
+   /*     T 65555               */
67
+   /*     R 76566               */
68
+   /*     A 87667               */
69
+
70
+   /*
71
+    * Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself,
72
+    * so there is no NULL to check for; its memory is released with xfree.
73
+    */
74
+   prev_row = ALLOC_N(int, l1+1);
75
+   curr_row = ALLOC_N(int, l1+1);
76
+
77
+   /* Initialize the current row. */
78
+   for (col=0; col<=l1; col++) {
79
+     curr_row[col] = col;
80
+   }
81
+
82
+   for (row=1; row<=l2; row++) {
83
+     /* Copy the current row to the previous row. */
84
+     memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
85
+
86
+     /* Calculate the values of the current row. */
87
+     curr_row[0] = row;
88
+     curr_row_min = row;
89
+
90
+     for (col=1; col<=l1; col++) {
91
+       /* Equal (cost=0) or substitution (cost=1). */
92
+       curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
93
+
94
+       /* Insertion if it's cheaper than substitution. */
95
+       if (prev_row[col]+1 < curr_row[col]) {
96
+         curr_row[col] = prev_row[col]+1;
97
+       }
98
+
99
+       /* Deletion if it's cheaper than substitution. */
100
+       if (curr_row[col-1]+1 < curr_row[col]) {
101
+         curr_row[col] = curr_row[col-1]+1;
102
+       }
103
+
104
+       /* Keep track of the minimum value on this row. */
105
+       if (curr_row[col] < curr_row_min) {
106
+         curr_row_min = curr_row[col];
107
+       }
108
+     }
109
+
110
+     /* Return nil as soon as the row minimum reaches the threshold. */
111
+     if (threshold > -1 && curr_row_min >= threshold) {
112
+       xfree(prev_row);
113
+       xfree(curr_row);
114
+       return Qnil;
115
+     }
116
+   }
117
+
118
+   /* The result is the last value on the last row. */
119
+   result = curr_row[l1];
120
+   xfree(prev_row);
121
+   xfree(curr_row);
122
+
123
+   /* Return the Ruby version of the result. */
124
+   return INT2FIX(result);
125
+ }
@@ -0,0 +1,21 @@
1
+ #include "ruby.h"
2
+
3
+ /* Distance routines defined in the other C files of this extension. */
4
+ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
5
+ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
6
+ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
7
+ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
8
+
9
+ /* Returns 1 when ary is a non-empty array whose elements are all strings, 0 otherwise. */
10
+ static int levenshtein_all_strings(VALUE ary) {
11
+   long i;
12
+   long len = RARRAY_LEN(ary);
13
+
14
+   if (len == 0) {
15
+     return 0;
16
+   }
17
+
18
+   for (i = 0; i < len; i++) {
19
+     if (TYPE(rb_ary_entry(ary, i)) != T_STRING) {
20
+       return 0;
21
+     }
22
+   }
23
+
24
+   return 1;
25
+ }
26
+
27
+ /*
28
+  * Picks the C routine that matches the types of both sequences:
29
+  * two strings, two arrays of strings, two arrays, or anything else.
30
+  *
31
+  * Every element of both arrays is checked, not only the first one, because the
32
+  * array-of-strings routine hands each element to rb_str_cmp without any check.
33
+  *
34
+  * Returns whatever the selected routine returns.
35
+  */
36
+ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
37
+   if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2) == T_STRING)) {
38
+     return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
39
+   } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2) == T_ARRAY)) {
40
+     if (levenshtein_all_strings(rb_o1) && levenshtein_all_strings(rb_o2)) {
41
+       return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
42
+     } else {
43
+       return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
44
+     }
45
+   } else {
46
+     return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
47
+   }
48
+ }
49
+
50
+ /* Extension entry point: defines Levenshtein.levenshtein_distance_fast (3 arguments). */
51
+ void Init_levenshtein_fast(void) {
52
+   VALUE mLevenshtein = rb_define_module("Levenshtein");
53
+
54
+   rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
55
+ }
@@ -0,0 +1,129 @@
1
+ #include "ruby.h"
2
+
3
+ /*
4
+  * Levenshtein distance between two arbitrary sequences, using only Ruby calls.
5
+  *
6
+  * rb_o1, rb_o2: objects answering "length" and "[]"; elements answer "==".
7
+  * rb_threshold: nil for no threshold, otherwise a Fixnum; nil is returned as
8
+  * soon as the smallest value of a row reaches it.
9
+  *
10
+  * Returns the distance as a Fixnum, or nil when the threshold is reached.
11
+  *
12
+  * NOTE(review): if one of the Ruby calls raises, the rows are not released.
13
+  */
14
+ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
15
+   int threshold;
16
+   int l1, l2;
17
+   int *prev_row, *curr_row;
18
+   int col, row;
19
+   int curr_row_min, result;
20
+   int offset;
21
+
22
+   ID id_length = rb_intern("length");
23
+   ID id_get = rb_intern("[]");
24
+   ID id_equal = rb_intern("==");
25
+
26
+   /* Get the sizes of both sequences. */
27
+   l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
28
+   l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
29
+
30
+   /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */
31
+   if (!NIL_P(rb_threshold)) {
32
+     threshold = FIX2INT(rb_threshold);
33
+   } else {
34
+     threshold = -1;
35
+   }
36
+
37
+   /*
38
+    * Do the expensive calculation on a subset of the sequences, if possible,
39
+    * by removing the common prefix. The scan stops at the end of the shorter
40
+    * sequence, so the result never depends on what "[]" answers for an index
41
+    * past the end.
42
+    */
43
+   offset = 0;
44
+   while ((offset < l1) && (offset < l2) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
45
+     offset++;
46
+   }
47
+
48
+   /*
49
+    * Do the expensive calculation on a subset of the sequences, if possible,
50
+    * by removing the common postfix.
51
+    */
52
+   while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
53
+     l1--;
54
+     l2--;
55
+   }
56
+
57
+   l1 -= offset;
58
+   l2 -= offset;
59
+
60
+   /* The Levenshtein algorithm itself. */
61
+   /*        s1=                */
62
+   /*        ERIK               */
63
+   /*                           */
64
+   /*       01234               */
65
+   /* s2= V 11234               */
66
+   /*     E 21234               */
67
+   /*     E 32234               */
68
+   /*     N 43334 <- prev_row   */
69
+   /*     S 54444 <- curr_row   */
70
+   /*     T 65555               */
71
+   /*     R 76566               */
72
+   /*     A 87667               */
73
+
74
+   /*
75
+    * Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself,
76
+    * so there is no NULL to check for; its memory is released with xfree.
77
+    */
78
+   prev_row = ALLOC_N(int, l1+1);
79
+   curr_row = ALLOC_N(int, l1+1);
80
+
81
+   /* Initialize the current row. */
82
+   for (col=0; col<=l1; col++) {
83
+     curr_row[col] = col;
84
+   }
85
+
86
+   for (row=1; row<=l2; row++) {
87
+     /* Copy the current row to the previous row. */
88
+     memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
89
+
90
+     /* Calculate the values of the current row. */
91
+     curr_row[0] = row;
92
+     curr_row_min = row;
93
+
94
+     for (col=1; col<=l1; col++) {
95
+       /* Equal (cost=0) or substitution (cost=1). */
96
+       curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
97
+
98
+       /* Insertion if it's cheaper than substitution. */
99
+       if (prev_row[col]+1 < curr_row[col]) {
100
+         curr_row[col] = prev_row[col]+1;
101
+       }
102
+
103
+       /* Deletion if it's cheaper than substitution. */
104
+       if (curr_row[col-1]+1 < curr_row[col]) {
105
+         curr_row[col] = curr_row[col-1]+1;
106
+       }
107
+
108
+       /* Keep track of the minimum value on this row. */
109
+       if (curr_row[col] < curr_row_min) {
110
+         curr_row_min = curr_row[col];
111
+       }
112
+     }
113
+
114
+     /* Return nil as soon as the row minimum reaches the threshold. */
115
+     if (threshold > -1 && curr_row_min >= threshold) {
116
+       xfree(prev_row);
117
+       xfree(curr_row);
118
+       return Qnil;
119
+     }
120
+   }
121
+
122
+   /* The result is the last value on the last row. */
123
+   result = curr_row[l1];
124
+   xfree(prev_row);
125
+   xfree(curr_row);
126
+
127
+   /* Return the Ruby version of the result. */
128
+   return INT2FIX(result);
129
+ }
@@ -0,0 +1,133 @@
1
+ #include "ruby.h"
2
+
3
+ /*
4
+  * Levenshtein distance between two Ruby strings, compared byte by byte.
5
+  *
6
+  * rb_o1, rb_o2: the strings to compare (converted with StringValue); their
7
+  * bytes are read through RSTRING_PTR/RSTRING_LEN, so one edit is one byte.
8
+  * rb_threshold: nil for no threshold, otherwise a Fixnum; nil is returned as
9
+  * soon as the smallest value of a row reaches it.
10
+  *
11
+  * Returns the distance as a Fixnum, or nil when the threshold is reached.
12
+  *
13
+  * NOTE(review): a multi-byte character can therefore cost more than one edit.
14
+  */
15
+ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
16
+   int threshold;
17
+   int l1, l2;
18
+   int *prev_row, *curr_row;
19
+   int col, row;
20
+   int curr_row_min, result;
21
+   int offset;
22
+   char *s1, *s2;
23
+
24
+   /* Convert Ruby's s1 to C's s1. */
25
+   rb_o1 = StringValue(rb_o1);
26
+   s1 = RSTRING_PTR(rb_o1);
27
+   l1 = RSTRING_LEN(rb_o1);
28
+
29
+   /* Convert Ruby's s2 to C's s2. */
30
+   rb_o2 = StringValue(rb_o2);
31
+   s2 = RSTRING_PTR(rb_o2);
32
+   l2 = RSTRING_LEN(rb_o2);
33
+
34
+   /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */
35
+   if (!NIL_P(rb_threshold)) {
36
+     threshold = FIX2INT(rb_threshold);
37
+   } else {
38
+     threshold = -1;
39
+   }
40
+
41
+   /*
42
+    * Do the expensive calculation on a subset of the sequences, if possible,
43
+    * by removing the common prefix. The scan stops at the end of the shorter
44
+    * string: the lengths are known, and a Ruby string may contain NUL bytes,
45
+    * so a terminator cannot be relied upon to end the loop.
46
+    */
47
+   offset = 0;
48
+   while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
49
+     offset++;
50
+   }
51
+
52
+   /*
53
+    * Do the expensive calculation on a subset of the sequences, if possible,
54
+    * by removing the common postfix.
55
+    */
56
+   while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
57
+     l1--;
58
+     l2--;
59
+   }
60
+
61
+   l1 -= offset;
62
+   l2 -= offset;
63
+
64
+   /* The Levenshtein algorithm itself. */
65
+   /*        s1=                */
66
+   /*        ERIK               */
67
+   /*                           */
68
+   /*       01234               */
69
+   /* s2= V 11234               */
70
+   /*     E 21234               */
71
+   /*     E 32234               */
72
+   /*     N 43334 <- prev_row   */
73
+   /*     S 54444 <- curr_row   */
74
+   /*     T 65555               */
75
+   /*     R 76566               */
76
+   /*     A 87667               */
77
+
78
+   /*
79
+    * Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself,
80
+    * so there is no NULL to check for; its memory is released with xfree.
81
+    */
82
+   prev_row = ALLOC_N(int, l1+1);
83
+   curr_row = ALLOC_N(int, l1+1);
84
+
85
+   /* Initialize the current row. */
86
+   for (col=0; col<=l1; col++) {
87
+     curr_row[col] = col;
88
+   }
89
+
90
+   for (row=1; row<=l2; row++) {
91
+     /* Copy the current row to the previous row. */
92
+     memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
93
+
94
+     /* Calculate the values of the current row. */
95
+     curr_row[0] = row;
96
+     curr_row_min = row;
97
+
98
+     for (col=1; col<=l1; col++) {
99
+       /* Equal (cost=0) or substitution (cost=1). */
100
+       curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
101
+
102
+       /* Insertion if it's cheaper than substitution. */
103
+       if (prev_row[col]+1 < curr_row[col]) {
104
+         curr_row[col] = prev_row[col]+1;
105
+       }
106
+
107
+       /* Deletion if it's cheaper than substitution. */
108
+       if (curr_row[col-1]+1 < curr_row[col]) {
109
+         curr_row[col] = curr_row[col-1]+1;
110
+       }
111
+
112
+       /* Keep track of the minimum value on this row. */
113
+       if (curr_row[col] < curr_row_min) {
114
+         curr_row_min = curr_row[col];
115
+       }
116
+     }
117
+
118
+     /* Return nil as soon as the row minimum reaches the threshold. */
119
+     if (threshold > -1 && curr_row_min >= threshold) {
120
+       xfree(prev_row);
121
+       xfree(curr_row);
122
+       return Qnil;
123
+     }
124
+   }
125
+
126
+   /* The result is the last value on the last row. */
127
+   result = curr_row[l1];
128
+   xfree(prev_row);
129
+   xfree(curr_row);
130
+
131
+   /* Return the Ruby version of the result. */
132
+   return INT2FIX(result);
133
+ }
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "levenshtein/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "levenshtein-19"
7
+ s.version = Levenshtein::VERSION
8
+ s.authors = ["Erik Veenstra", "Ryan Fitzgerald"]
9
+ s.email = ["rwfitzge@gmail.com"]
10
+ s.homepage = "http://github.com/rwfitzge/levenshtein-19"
11
+ s.summary = %q{Calculates the Levenshtein distance between two byte strings.}
12
+ s.description = %q{Calculates the Levenshtein distance between two byte strings.}
13
+
14
+ s.rubyforge_project = "levenshtein-19"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+ s.extensions = ["ext/levenshtein/extconf.rb"]
21
+ end
@@ -0,0 +1,3 @@
1
+ module Levenshtein
2
+ VERSION = "0.3.0"
3
+ end
@@ -0,0 +1,109 @@
1
+ require "levenshtein/version"
2
+
3
+ begin
4
+ require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
5
+ rescue LoadError
6
+ begin
7
+ require "levenshtein_fast" # If compiled by the build script.
8
+ rescue LoadError
9
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
10
+ end
11
+ end
12
+
13
+ # The Levenshtein distance is a metric for measuring the amount
14
+ # of difference between two sequences (i.e., the so-called edit
15
+ # distance). The Levenshtein distance between two sequences is
16
+ # given by the minimum number of operations needed to transform
17
+ # one sequence into the other, where an operation is an
18
+ # insertion, deletion, or substitution of a single element.
19
+ #
20
+ # More information about the Levenshtein distance algorithm:
21
+ # http://en.wikipedia.org/wiki/Levenshtein_distance .
22
+
23
+ module Levenshtein
24
+ # Returns the Levenshtein distance as a number between 0.0 and
25
+ # 1.0. It's basically the Levenshtein distance divided by the
26
+ # length of the longest sequence.
27
+ # A threshold is scaled by that length before it is passed to distance; nil is returned when distance returns nil.
28
+ def self.normalized_distance(s1, s2, threshold=nil)
29
+ s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
30
+
31
+ if s2.length == 0
32
+ 0.0 # Since s1.length <= s2.length, s1 must be empty as well.
33
+ else
34
+ if threshold
35
+ if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
36
+ d.to_f/s2.length
37
+ else
38
+ nil
39
+ end
40
+ else
41
+ self.distance(s1, s2).to_f/s2.length
42
+ end
43
+ end
44
+ end
45
+
46
+ # Returns the Levenshtein distance between two sequences.
47
+ #
48
+ # The two sequences can be two strings, two arrays, or two other
49
+ # objects. Strings, arrays and arrays of strings are handled with
50
+ # optimized (very fast) C code. All other sequences are handled
51
+ # with generic (fast) C code.
52
+ #
53
+ # The sequences should respond to :length and :[] and all objects
54
+ # in the sequences (as returned by []) should respond to :==.
55
+ # With a threshold, nil is returned as soon as a lower bound of the distance reaches it.
56
+ def self.distance(s1, s2, threshold=nil)
57
+ s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
58
+
59
+ # Handle some basic circumstances.
60
+
61
+ return 0 if s1 == s2
62
+ return s2.length if s1.length == 0
63
+
64
+ if threshold
65
+ return nil if (s2.length-s1.length) >= threshold
66
+ # Elements that occur in only one of the sequences need at least one edit each.
67
+ a1, a2 = nil, nil
68
+ a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
69
+ a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
70
+
71
+ if a1 and a2
72
+ return nil if (a1-a2).length >= threshold
73
+ return nil if (a2-a1).length >= threshold
74
+ end
75
+ end
76
+
77
+ distance_fast_or_slow(s1, s2, threshold)
78
+ end
79
+ # Uses the C implementation if this module responds to it, otherwise the Ruby one.
80
+ def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
81
+ if respond_to?(:levenshtein_distance_fast)
82
+ levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
83
+ else
84
+ levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
85
+ end
86
+ end
87
+ # Ruby implementation; only the previous row (prow) and the current row are kept.
88
+ def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
89
+ row = (0..s1.length).to_a
90
+
91
+ 1.upto(s2.length) do |y|
92
+ prow = row
93
+ row = [y]
94
+
95
+ 1.upto(s1.length) do |x|
96
+ row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
97
+ end
98
+
99
+ # Stop analysing this sequence as soon as the best possible
100
+ # result for this sequence is at least the threshold.
101
+ # (The minimum value in the next row will be equal to or greater
102
+ # than the minimum value in this row.)
103
+
104
+ return nil if threshold and row.min >= threshold
105
+ end
106
+
107
+ row[-1]
108
+ end
109
+ end
data/test/test.rb ADDED
@@ -0,0 +1,125 @@
1
+ require "test/unit"
2
+ require "levenshtein"
3
+
4
+ module Levenshtein
5
+ class TestSequence
6
+ def initialize(o)
7
+ @sequence = o
8
+ end
9
+
10
+ def length
11
+ @sequence.length
12
+ end
13
+
14
+ def [](pos)
15
+ @sequence[pos]
16
+ end
17
+ end
18
+
19
+ class TestElement
20
+ attr_reader :object
21
+
22
+ def initialize(o)
23
+ @object = o
24
+ end
25
+
26
+ def ==(other)
27
+ @object == other.object
28
+ end
29
+ end
30
+ end
31
+
32
+ class TestLevenshteinString < Test::Unit::TestCase
33
+ def test_erik_veenstra
34
+ assert_equal(7, Levenshtein.distance("erik", "veenstra"))
35
+ assert_equal(7, Levenshtein.distance("veenstra", "erik"))
36
+
37
+ assert_in_delta(0.875, Levenshtein.normalized_distance("erik", "veenstra"), 0.01)
38
+ assert_in_delta(0.875, Levenshtein.normalized_distance("veenstra", "erik"), 0.01)
39
+ end
40
+
41
+ def test_empty_string
42
+ assert_equal(0, Levenshtein.distance("", ""))
43
+ assert_equal(3, Levenshtein.distance("", "foo"))
44
+ assert_equal(3, Levenshtein.distance("foo", ""))
45
+
46
+ assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
47
+ assert_in_delta(1.0, Levenshtein.normalized_distance("", "foo"), 0.01)
48
+ assert_in_delta(1.0, Levenshtein.normalized_distance("foo", ""), 0.01)
49
+ end
50
+
51
+ def test_same_string
52
+ assert_equal(0, Levenshtein.distance("", ""))
53
+ assert_equal(0, Levenshtein.distance("foo", "foo"))
54
+
55
+ assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
56
+ assert_in_delta(0.0, Levenshtein.normalized_distance("foo", "foo"), 0.01)
57
+ end
58
+
59
+ def test_threshold
60
+ assert_equal(3, Levenshtein.distance("foo", "foobar"))
61
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
62
+ assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
63
+
64
+ assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
65
+ assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
66
+ assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
67
+ end
68
+
69
+ def test_same_head_and_or_tail
70
+ assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd"))
71
+ assert_equal(3, Levenshtein.distance("ab123", "abxyz"))
72
+ assert_equal(3, Levenshtein.distance("123cd", "xyzcd"))
73
+ assert_equal(5, Levenshtein.distance("123cd123", "123"))
74
+
75
+ assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
76
+ assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
77
+ assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
78
+ assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
79
+ end
80
+ end
81
+
82
+ class TestLevenshteinArray < Test::Unit::TestCase
83
+ def test_erik_veenstra
84
+ x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
85
+
86
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
87
+ end
88
+ end
89
+
90
+ class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
91
+ def test_erik_veenstra
92
+ x = lambda{|s| s.scan(/./)}
93
+
94
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
95
+ end
96
+ end
97
+
98
+ class TestLevenshteinGeneric < Test::Unit::TestCase
99
+ def test_erik_veenstra
100
+ x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
101
+
102
+ assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
103
+ end
104
+ end
105
+
106
+ class TestLevenshteinSlow < Test::Unit::TestCase
107
+ def test_erik_veenstra
108
+ assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
109
+ end
110
+
111
+ def test_empty_sequence
112
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
113
+ assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
114
+ end
115
+
116
+ def test_same_sequence
117
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
118
+ assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
119
+ end
120
+
121
+ def test_threshold
122
+ assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
123
+ assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
124
+ end
125
+ end
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein-19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Erik Veenstra
9
+ - Ryan Fitzgerald
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2011-11-21 00:00:00.000000000Z
14
+ dependencies: []
15
+ description: Calculates the Levenshtein distance between two byte strings.
16
+ email:
17
+ - rwfitzge@gmail.com
18
+ executables: []
19
+ extensions:
20
+ - ext/levenshtein/extconf.rb
21
+ extra_rdoc_files: []
22
+ files:
23
+ - .gitignore
24
+ - CHANGELOG
25
+ - Gemfile
26
+ - LICENSE
27
+ - README
28
+ - Rakefile
29
+ - ext/levenshtein/extconf.rb
30
+ - ext/levenshtein/levenshtein_array.c
31
+ - ext/levenshtein/levenshtein_array_of_strings.c
32
+ - ext/levenshtein/levenshtein_fast.c
33
+ - ext/levenshtein/levenshtein_generic.c
34
+ - ext/levenshtein/levenshtein_string.c
35
+ - levenshtein-19.gemspec
36
+ - lib/levenshtein.rb
37
+ - lib/levenshtein/version.rb
38
+ - test/test.rb
39
+ homepage: http://github.com/rwfitzge/levenshtein-19
40
+ licenses: []
41
+ post_install_message:
42
+ rdoc_options: []
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ none: false
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ requirements: []
58
+ rubyforge_project: levenshtein-19
59
+ rubygems_version: 1.8.11
60
+ signing_key:
61
+ specification_version: 3
62
+ summary: Calculates the Levenshtein distance between two byte strings.
63
+ test_files:
64
+ - test/test.rb