levenshtein-extended 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ */.DS_Store
6
+ tmp/*
7
+ */.DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in levenshtein.gemspec
gemspec
data/README ADDED
@@ -0,0 +1,14 @@
1
+ The levenshtein module implements fast Levenshtein edit distance computation (insertions, deletions and substitutions) in O(n) memory and O(n^2) time, using a C wrapper.
2
+
3
+ USAGE:
4
+
5
+ @install
6
+ git clone git://github.com/esdras/levenshtein.git
7
+ cd levenshtein
8
+ rake compile
9
+ rake install
10
+
11
+ @usage
12
+ require 'levenshtein'
13
+ Levenshtein.normalized_distance("string 1", "string 2")
14
+ Levenshtein.normalized_distance([2, 3, 4, 5], [1, 2, 3, 4])
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/extensiontask'
3
+ Rake::ExtensionTask.new('levenshtein_in_c')
@@ -0,0 +1,10 @@
1
require "mkmf"

# Accept the usual --with-levenshtein_in_c-dir / -include / -lib options.
dir_config("levenshtein_in_c")

# No have_library probes: levenshtein_array, levenshtein_array_of_strings,
# levenshtein_generic and levenshtein_string are source files of this very
# extension, not external libraries. mkmf compiles every .c file found in
# this directory into the single shared object.

# Install the shared object as levenshtein/levenshtein_in_c, which is the
# first path lib/levenshtein.rb tries to require. The Init function is still
# Init_levenshtein_in_c (derived from the basename).
create_makefile("levenshtein/levenshtein_in_c")
@@ -0,0 +1,127 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ long l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_eql = rb_intern("==");
12
+
13
+ /* Get the sizes of both arrays. */
14
+
15
+ l1 = RARRAY_LEN(RARRAY(rb_o1));
16
+ l2 = RARRAY_LEN(RARRAY(rb_o2));
17
+
18
+ /* Convert Ruby's threshold to C's threshold. */
19
+
20
+ if (!NIL_P(rb_threshold)) {
21
+ threshold = FIX2INT(rb_threshold);
22
+ } else {
23
+ threshold = -1;
24
+ }
25
+
26
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
+
28
+ offset = 0;
29
+ while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
30
+ offset++;
31
+ }
32
+
33
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
34
+
35
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
36
+ l1--;
37
+ l2--;
38
+ }
39
+
40
+ l1 -= offset;
41
+ l2 -= offset;
42
+
43
+ /* The Levenshtein algorithm itself. */
44
+
45
+ /* s1= */
46
+ /* ERIK */
47
+ /* */
48
+ /* 01234 */
49
+ /* s2=V 11234 */
50
+ /* E 21234 */
51
+ /* E 32234 */
52
+ /* N 43334 <- prev_row */
53
+ /* S 54444 <- curr_row */
54
+ /* T 65555 */
55
+ /* R 76566 */
56
+ /* A 87667 */
57
+
58
+ /* Allocate memory for both rows */
59
+
60
+ prev_row = ALLOC_N(int, l1+1);
61
+ curr_row = ALLOC_N(int, l1+1);
62
+
63
+ if ((prev_row == NULL) || (curr_row == NULL)) {
64
+ rb_raise(rb_eNoMemError, "out of memory");
65
+ }
66
+
67
+ /* Initialize the current row. */
68
+
69
+ for (col=0; col<=l1; col++) {
70
+ curr_row[col] = col;
71
+ }
72
+
73
+ for (row=1; row<=l2; row++) {
74
+ /* Copy the current row to the previous row. */
75
+
76
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
77
+
78
+ /* Calculate the values of the current row. */
79
+
80
+ curr_row[0] = row;
81
+ curr_row_min = row;
82
+
83
+ for (col=1; col<=l1; col++) {
84
+ /* Equal (cost=0) or substitution (cost=1). */
85
+
86
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
87
+
88
+ /* Insertion if it's cheaper than substitution. */
89
+
90
+ if (prev_row[col]+1 < curr_row[col]) {
91
+ curr_row[col] = prev_row[col]+1;
92
+ }
93
+
94
+ /* Deletion if it's cheaper than substitution. */
95
+
96
+ if (curr_row[col-1]+1 < curr_row[col]) {
97
+ curr_row[col] = curr_row[col-1]+1;
98
+ }
99
+
100
+ /* Keep track of the minimum value on this row. */
101
+
102
+ if (curr_row[col] < curr_row_min) {
103
+ curr_row_min = curr_row[col];
104
+ }
105
+ }
106
+
107
+ /* Return nil as soon as we exceed the threshold. */
108
+
109
+ if (threshold > -1 && curr_row_min >= threshold) {
110
+ free(prev_row);
111
+ free(curr_row);
112
+
113
+ return Qnil;
114
+ }
115
+ }
116
+
117
+ /* The result is the last value on the last row. */
118
+
119
+ result = curr_row[l1];
120
+
121
+ free(prev_row);
122
+ free(curr_row);
123
+
124
+ /* Return the Ruby version of the result. */
125
+
126
+ return INT2FIX(result);
127
+ }
@@ -0,0 +1,125 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ long l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ /* Get the sizes of both arrays. */
12
+
13
+ l1 = RARRAY_LEN(RARRAY(rb_o1));
14
+ l2 = RARRAY_LEN(RARRAY(rb_o2));
15
+
16
+ /* Convert Ruby's threshold to C's threshold. */
17
+
18
+ if (!NIL_P(rb_threshold)) {
19
+ threshold = FIX2INT(rb_threshold);
20
+ } else {
21
+ threshold = -1;
22
+ }
23
+
24
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
25
+
26
+ offset = 0;
27
+ while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
28
+ offset++;
29
+ }
30
+
31
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
32
+
33
+ while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
34
+ l1--;
35
+ l2--;
36
+ }
37
+
38
+ l1 -= offset;
39
+ l2 -= offset;
40
+
41
+ /* The Levenshtein algorithm itself. */
42
+
43
+ /* s1= */
44
+ /* ERIK */
45
+ /* */
46
+ /* 01234 */
47
+ /* s2=V 11234 */
48
+ /* E 21234 */
49
+ /* E 32234 */
50
+ /* N 43334 <- prev_row */
51
+ /* S 54444 <- curr_row */
52
+ /* T 65555 */
53
+ /* R 76566 */
54
+ /* A 87667 */
55
+
56
+ /* Allocate memory for both rows */
57
+
58
+ prev_row = ALLOC_N(int, l1+1);
59
+ curr_row = ALLOC_N(int, l1+1);
60
+
61
+ if ((prev_row == NULL) || (curr_row == NULL)) {
62
+ rb_raise(rb_eNoMemError, "out of memory");
63
+ }
64
+
65
+ /* Initialize the current row. */
66
+
67
+ for (col=0; col<=l1; col++) {
68
+ curr_row[col] = col;
69
+ }
70
+
71
+ for (row=1; row<=l2; row++) {
72
+ /* Copy the current row to the previous row. */
73
+
74
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
75
+
76
+ /* Calculate the values of the current row. */
77
+
78
+ curr_row[0] = row;
79
+ curr_row_min = row;
80
+
81
+ for (col=1; col<=l1; col++) {
82
+ /* Equal (cost=0) or substitution (cost=1). */
83
+
84
+ curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
+
86
+ /* Insertion if it's cheaper than substitution. */
87
+
88
+ if (prev_row[col]+1 < curr_row[col]) {
89
+ curr_row[col] = prev_row[col]+1;
90
+ }
91
+
92
+ /* Deletion if it's cheaper than substitution. */
93
+
94
+ if (curr_row[col-1]+1 < curr_row[col]) {
95
+ curr_row[col] = curr_row[col-1]+1;
96
+ }
97
+
98
+ /* Keep track of the minimum value on this row. */
99
+
100
+ if (curr_row[col] < curr_row_min) {
101
+ curr_row_min = curr_row[col];
102
+ }
103
+ }
104
+
105
+ /* Return nil as soon as we exceed the threshold. */
106
+
107
+ if (threshold > -1 && curr_row_min >= threshold) {
108
+ free(prev_row);
109
+ free(curr_row);
110
+
111
+ return Qnil;
112
+ }
113
+ }
114
+
115
+ /* The result is the last value on the last row. */
116
+
117
+ result = curr_row[l1];
118
+
119
+ free(prev_row);
120
+ free(curr_row);
121
+
122
+ /* Return the Ruby version of the result. */
123
+
124
+ return INT2FIX(result);
125
+ }
@@ -0,0 +1,21 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
5
+ return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
6
+ } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
7
+ if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
8
+ return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
9
+ } else {
10
+ return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
11
+ }
12
+ } else {
13
+ return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
14
+ }
15
+ }
16
+
17
+ void Init_levenshtein_in_c() {
18
+ VALUE mLevenshtein = rb_define_module("Levenshtein");
19
+
20
+ rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
21
+ }
@@ -0,0 +1,129 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_length = rb_intern("length");
12
+ ID id_get = rb_intern("[]");
13
+ ID id_equal = rb_intern("==");
14
+
15
+ /* Get the sizes of both sequences. */
16
+
17
+ l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
18
+ l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
19
+
20
+ /* Convert Ruby's threshold to C's threshold. */
21
+
22
+ if (!NIL_P(rb_threshold)) {
23
+ threshold = FIX2INT(rb_threshold);
24
+ } else {
25
+ threshold = -1;
26
+ }
27
+
28
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
29
+
30
+ offset = 0;
31
+ while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
32
+ offset++;
33
+ }
34
+
35
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
36
+
37
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
38
+ l1--;
39
+ l2--;
40
+ }
41
+
42
+ l1 -= offset;
43
+ l2 -= offset;
44
+
45
+ /* The Levenshtein algorithm itself. */
46
+
47
+ /* s1= */
48
+ /* ERIK */
49
+ /* */
50
+ /* 01234 */
51
+ /* s2=V 11234 */
52
+ /* E 21234 */
53
+ /* E 32234 */
54
+ /* N 43334 <- prev_row */
55
+ /* S 54444 <- curr_row */
56
+ /* T 65555 */
57
+ /* R 76566 */
58
+ /* A 87667 */
59
+
60
+ /* Allocate memory for both rows */
61
+
62
+ prev_row = ALLOC_N(int, l1+1);
63
+ curr_row = ALLOC_N(int, l1+1);
64
+
65
+ if ((prev_row == NULL) || (curr_row == NULL)) {
66
+ rb_raise(rb_eNoMemError, "out of memory");
67
+ }
68
+
69
+ /* Initialize the current row. */
70
+
71
+ for (col=0; col<=l1; col++) {
72
+ curr_row[col] = col;
73
+ }
74
+
75
+ for (row=1; row<=l2; row++) {
76
+ /* Copy the current row to the previous row. */
77
+
78
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
79
+
80
+ /* Calculate the values of the current row. */
81
+
82
+ curr_row[0] = row;
83
+ curr_row_min = row;
84
+
85
+ for (col=1; col<=l1; col++) {
86
+ /* Equal (cost=0) or substitution (cost=1). */
87
+
88
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
+
90
+ /* Insertion if it's cheaper than substitution. */
91
+
92
+ if (prev_row[col]+1 < curr_row[col]) {
93
+ curr_row[col] = prev_row[col]+1;
94
+ }
95
+
96
+ /* Deletion if it's cheaper than substitution. */
97
+
98
+ if (curr_row[col-1]+1 < curr_row[col]) {
99
+ curr_row[col] = curr_row[col-1]+1;
100
+ }
101
+
102
+ /* Keep track of the minimum value on this row. */
103
+
104
+ if (curr_row[col] < curr_row_min) {
105
+ curr_row_min = curr_row[col];
106
+ }
107
+ }
108
+
109
+ /* Return nil as soon as we exceed the threshold. */
110
+
111
+ if (threshold > -1 && curr_row_min >= threshold) {
112
+ free(prev_row);
113
+ free(curr_row);
114
+
115
+ return Qnil;
116
+ }
117
+ }
118
+
119
+ /* The result is the last value on the last row. */
120
+
121
+ result = curr_row[l1];
122
+
123
+ free(prev_row);
124
+ free(curr_row);
125
+
126
+ /* Return the Ruby version of the result. */
127
+
128
+ return INT2FIX(result);
129
+ }
@@ -0,0 +1,133 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ long l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+ char *s1, *s2;
11
+
12
+ /* Convert Ruby's s1 to C's s1. */
13
+
14
+ rb_o1 = StringValue(rb_o1);
15
+ s1 = RSTRING_PTR(RSTRING(rb_o1));
16
+ l1 = RSTRING_LEN(RSTRING(rb_o1));
17
+
18
+ /* Convert Ruby's s2 to C's s2. */
19
+
20
+ rb_o2 = StringValue(rb_o2);
21
+ s2 = RSTRING_PTR(RSTRING(rb_o2));
22
+ l2 = RSTRING_LEN(RSTRING(rb_o2));
23
+
24
+ /* Convert Ruby's threshold to C's threshold. */
25
+
26
+ if (!NIL_P(rb_threshold)) {
27
+ threshold = FIX2INT(rb_threshold);
28
+ } else {
29
+ threshold = -1;
30
+ }
31
+
32
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
33
+
34
+ offset = 0;
35
+ while (s1[offset] == s2[offset]) {
36
+ offset++;
37
+ }
38
+
39
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
40
+
41
+ while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
42
+ l1--;
43
+ l2--;
44
+ }
45
+
46
+ l1 -= offset;
47
+ l2 -= offset;
48
+
49
+ /* The Levenshtein algorithm itself. */
50
+
51
+ /* s1= */
52
+ /* ERIK */
53
+ /* */
54
+ /* 01234 */
55
+ /* s2=V 11234 */
56
+ /* E 21234 */
57
+ /* E 32234 */
58
+ /* N 43334 <- prev_row */
59
+ /* S 54444 <- curr_row */
60
+ /* T 65555 */
61
+ /* R 76566 */
62
+ /* A 87667 */
63
+
64
+ /* Allocate memory for both rows */
65
+
66
+ prev_row = ALLOC_N(int, l1+1);
67
+ curr_row = ALLOC_N(int, l1+1);
68
+
69
+ if ((prev_row == NULL) || (curr_row == NULL)) {
70
+ rb_raise(rb_eNoMemError, "out of memory");
71
+ }
72
+
73
+ /* Initialize the current row. */
74
+
75
+ for (col=0; col<=l1; col++) {
76
+ curr_row[col] = col;
77
+ }
78
+
79
+ for (row=1; row<=l2; row++) {
80
+ /* Copy the current row to the previous row. */
81
+
82
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
83
+
84
+ /* Calculate the values of the current row. */
85
+
86
+ curr_row[0] = row;
87
+ curr_row_min = row;
88
+
89
+ for (col=1; col<=l1; col++) {
90
+ /* Equal (cost=0) or substitution (cost=1). */
91
+
92
+ curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
93
+
94
+ /* Insertion if it's cheaper than substitution. */
95
+
96
+ if (prev_row[col]+1 < curr_row[col]) {
97
+ curr_row[col] = prev_row[col]+1;
98
+ }
99
+
100
+ /* Deletion if it's cheaper than substitution. */
101
+
102
+ if (curr_row[col-1]+1 < curr_row[col]) {
103
+ curr_row[col] = curr_row[col-1]+1;
104
+ }
105
+
106
+ /* Keep track of the minimum value on this row. */
107
+
108
+ if (curr_row[col] < curr_row_min) {
109
+ curr_row_min = curr_row[col];
110
+ }
111
+ }
112
+
113
+ /* Return nil as soon as we exceed the threshold. */
114
+
115
+ if (threshold > -1 && curr_row_min >= threshold) {
116
+ free(prev_row);
117
+ free(curr_row);
118
+
119
+ return Qnil;
120
+ }
121
+ }
122
+
123
+ /* The result is the last value on the last row. */
124
+
125
+ result = curr_row[l1];
126
+
127
+ free(prev_row);
128
+ free(curr_row);
129
+
130
+ /* Return the Ruby version of the result. */
131
+
132
+ return INT2FIX(result);
133
+ }
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "levenshtein/version"
4
+
5
# Gem specification for levenshtein-extended. The file lists are taken from
# git, so only tracked files are packaged.
Gem::Specification.new do |spec|
  # Identity
  spec.name              = "levenshtein-extended"
  spec.version           = Levenshtein::VERSION
  spec.rubyforge_project = "levenshtein"

  # Author and project information
  spec.authors  = ["Esdras Mayrink"]
  spec.email    = ["falecom@oesdras.com.br"]
  spec.homepage = ""

  spec.summary     = "fast string edit distance computation, using the Damerau-Levenshtein algorithm"
  spec.description = "The levenshtein module implements fast Damerau-Levenshtein edit distance computation in O(n) memory and O(n^2) time, using a C wrapper."

  # Packaged files: everything tracked by git, with tests and executables
  # picked out by their conventional directories.
  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map do |path|
    File.basename(path)
  end
end
@@ -0,0 +1,108 @@
1
+ begin
2
+ require "levenshtein/levenshtein_in_c" # If compiled by RubyGems.
3
+ rescue LoadError
4
+ begin
5
+ require "levenshtein_in_c" # If compiled by the build script.
6
+ rescue LoadError
7
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
8
+ end
9
+ end
10
+
11
+ # The Levenshtein distance is a metric for measuring the amount
12
+ # of difference between two sequences (i.e., the so called edit
13
+ # distance). The Levenshtein distance between two sequences is
14
+ # given by the minimum number of operations needed to transform
15
+ # one sequence into the other, where an operation is an
16
+ # insertion, deletion, or substitution of a single element.
17
+ #
18
+ # More information about the Levenshtein distance algorithm:
19
+ # http://en.wikipedia.org/wiki/Levenshtein_distance .
20
+
21
module Levenshtein

  # Returns the Levenshtein distance as a number between 0.0 and
  # 1.0. It's basically the Levenshtein distance divided by the
  # length of the longest sequence.
  #
  # If a threshold (itself a normalized value) is given, it is
  # converted to an absolute threshold for Levenshtein.distance and
  # nil is returned whenever that method gives up and returns nil.

  def self.normalized_distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.

    longest = s2.length

    return 0.0 if longest == 0 # Since s1.length <= s2.length, s1 must be empty as well.
    return distance(s1, s2).to_f / longest unless threshold

    d = distance(s1, s2, (threshold * longest + 1).to_i)

    d ? d.to_f / longest : nil
  end

  # Returns the Levenshtein distance between two sequences.
  #
  # The two sequences can be two strings, two arrays, or two other
  # objects. Strings, arrays and arrays of strings are handled with
  # optimized (very fast) C code. All other sequences are handled
  # with generic (fast) C code.
  #
  # The sequences should respond to :length and :[] and all objects
  # in the sequences (as returned by []) should respond to :==.
  #
  # If a threshold is given, nil is returned as soon as a cheap
  # lower bound of the distance, or every value of a row of the
  # distance matrix, is greater than or equal to the threshold.

  def self.distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.

    # Handle some basic circumstances.

    return 0 if s1 == s2
    return s2.length if s1.length == 0

    if threshold
      # The difference in length is a lower bound of the distance.

      return nil if (s2.length - s1.length) >= threshold

      # So is the number of elements that occur in only one of the
      # sequences. Sequences are used as they are if they support
      # :-, or split into characters if they support :scan.

      a1, a2 = nil, nil
      a1, a2 = s1, s2 if s1.respond_to?(:-) && s2.respond_to?(:-)
      a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) && s2.respond_to?(:scan)

      if a1 && a2
        return nil if (a1 - a2).length >= threshold
        return nil if (a2 - a1).length >= threshold
      end
    end

    distance_fast_or_slow(s1, s2, threshold)
  end

  # Uses the C implementation if it has been loaded, and the Ruby
  # implementation otherwise.

  def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
    if respond_to?(:levenshtein_distance_fast)
      levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
    else
      levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
    end
  end

  # Pure Ruby implementation. Only two rows of the distance matrix
  # are kept: the previous one (prow) and the current one (row).

  def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
    row = (0..s1.length).to_a

    1.upto(s2.length) do |y|
      prow = row
      row = [y]

      1.upto(s1.length) do |x|
        cost = s1[x-1] == s2[y-1] ? 0 : 1 # Equal or substitution.

        row[x] = [prow[x] + 1, row[x-1] + 1, prow[x-1] + cost].min
      end

      # Stop analysing this sequence as soon as the best possible
      # result for this sequence is bigger than the best result so far.
      # (The minimum value in the next row will be equal to or greater
      # than the minimum value in this row.)

      return nil if threshold && row.min >= threshold
    end

    row.last
  end
end
@@ -0,0 +1,3 @@
1
module Levenshtein
  # Version of the gem. Frozen so the constant cannot be modified in place.
  VERSION = "0.0.1".freeze
end
Binary file
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein-extended
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Esdras Mayrink
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-07-04 00:00:00 -03:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: The levenshtein module implements fast Damerau-Levenshtein edit distance computation in O(n) memory and O(n^2) time, using a C wrapper.
18
+ email:
19
+ - falecom@oesdras.com.br
20
+ executables: []
21
+
22
+ extensions: []
23
+
24
+ extra_rdoc_files: []
25
+
26
+ files:
27
+ - .gitignore
28
+ - Gemfile
29
+ - README
30
+ - Rakefile
31
+ - ext/levenshtein_in_c/extconf.rb
32
+ - ext/levenshtein_in_c/levenshtein_array.c
33
+ - ext/levenshtein_in_c/levenshtein_array_of_strings.c
34
+ - ext/levenshtein_in_c/levenshtein_fast.c
35
+ - ext/levenshtein_in_c/levenshtein_generic.c
36
+ - ext/levenshtein_in_c/levenshtein_string.c
37
+ - levenshtein.gemspec
38
+ - lib/levenshtein.rb
39
+ - lib/levenshtein/version.rb
40
+ - lib/levenshtein_in_c.bundle
41
+ has_rdoc: true
42
+ homepage: ""
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ requirements: []
63
+
64
+ rubyforge_project: levenshtein
65
+ rubygems_version: 1.5.2
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: fast string edit distance computation, using the Damerau-Levenshtein algorithm
69
+ test_files: []
70
+