levenshtein-extended 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ */.DS_Store
6
+ tmp/*
7
+ */.DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS instead of plain HTTP.
source "https://rubygems.org"

# Specify your gem's dependencies in levenshtein.gemspec
gemspec
data/README ADDED
@@ -0,0 +1,14 @@
1
+ The levenshtein module implements fast Levenshtein edit distance computation (insertions, deletions and substitutions; transpositions are not treated as a single edit) in O(n) memory and O(n^2) time, using a C extension.
2
+
3
+ USAGE:
4
+
5
+ @install
6
+ git clone git://github.com/esdras/levenshtein.git
7
+ cd levenshtein
8
+ rake compile
9
+ rake install
10
+
11
+ @usage
12
+ require 'levenshtein'
13
+ Levenshtein.normalized_distance("string 1", "string 2")
14
+ Levenshtein.normalized_distance([2, 3, 4, 5], [1, 2, 3, 4])
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
# Gem packaging tasks provided by Bundler.
require 'bundler/gem_tasks'

# Task definitions for building the native extension (the README's `rake compile` step).
require 'rake/extensiontask'

Rake::ExtensionTask.new('levenshtein_in_c')
@@ -0,0 +1,10 @@
1
# Generates the Makefile for the levenshtein_in_c native extension.
require "mkmf"

dir_config("levenshtein_in_c")

# The levenshtein_*.c files in this directory are ordinary sources of this
# extension, not external libraries, so there is nothing to probe for with
# have_library; mkmf compiles the C sources found next to this file.

# Install the extension as levenshtein/levenshtein_in_c so that it matches the
# first path lib/levenshtein.rb tries: require "levenshtein/levenshtein_in_c".
create_makefile("levenshtein/levenshtein_in_c")
@@ -0,0 +1,127 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ long l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_eql = rb_intern("==");
12
+
13
+ /* Get the sizes of both arrays. */
14
+
15
+ l1 = RARRAY_LEN(RARRAY(rb_o1));
16
+ l2 = RARRAY_LEN(RARRAY(rb_o2));
17
+
18
+ /* Convert Ruby's threshold to C's threshold. */
19
+
20
+ if (!NIL_P(rb_threshold)) {
21
+ threshold = FIX2INT(rb_threshold);
22
+ } else {
23
+ threshold = -1;
24
+ }
25
+
26
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
+
28
+ offset = 0;
29
+ while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
30
+ offset++;
31
+ }
32
+
33
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
34
+
35
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
36
+ l1--;
37
+ l2--;
38
+ }
39
+
40
+ l1 -= offset;
41
+ l2 -= offset;
42
+
43
+ /* The Levenshtein algorithm itself. */
44
+
45
+ /* s1= */
46
+ /* ERIK */
47
+ /* */
48
+ /* 01234 */
49
+ /* s2=V 11234 */
50
+ /* E 21234 */
51
+ /* E 32234 */
52
+ /* N 43334 <- prev_row */
53
+ /* S 54444 <- curr_row */
54
+ /* T 65555 */
55
+ /* R 76566 */
56
+ /* A 87667 */
57
+
58
+ /* Allocate memory for both rows */
59
+
60
+ prev_row = ALLOC_N(int, l1+1);
61
+ curr_row = ALLOC_N(int, l1+1);
62
+
63
+ if ((prev_row == NULL) || (curr_row == NULL)) {
64
+ rb_raise(rb_eNoMemError, "out of memory");
65
+ }
66
+
67
+ /* Initialize the current row. */
68
+
69
+ for (col=0; col<=l1; col++) {
70
+ curr_row[col] = col;
71
+ }
72
+
73
+ for (row=1; row<=l2; row++) {
74
+ /* Copy the current row to the previous row. */
75
+
76
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
77
+
78
+ /* Calculate the values of the current row. */
79
+
80
+ curr_row[0] = row;
81
+ curr_row_min = row;
82
+
83
+ for (col=1; col<=l1; col++) {
84
+ /* Equal (cost=0) or substitution (cost=1). */
85
+
86
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
87
+
88
+ /* Insertion if it's cheaper than substitution. */
89
+
90
+ if (prev_row[col]+1 < curr_row[col]) {
91
+ curr_row[col] = prev_row[col]+1;
92
+ }
93
+
94
+ /* Deletion if it's cheaper than substitution. */
95
+
96
+ if (curr_row[col-1]+1 < curr_row[col]) {
97
+ curr_row[col] = curr_row[col-1]+1;
98
+ }
99
+
100
+ /* Keep track of the minimum value on this row. */
101
+
102
+ if (curr_row[col] < curr_row_min) {
103
+ curr_row_min = curr_row[col];
104
+ }
105
+ }
106
+
107
+ /* Return nil as soon as we exceed the threshold. */
108
+
109
+ if (threshold > -1 && curr_row_min >= threshold) {
110
+ free(prev_row);
111
+ free(curr_row);
112
+
113
+ return Qnil;
114
+ }
115
+ }
116
+
117
+ /* The result is the last value on the last row. */
118
+
119
+ result = curr_row[l1];
120
+
121
+ free(prev_row);
122
+ free(curr_row);
123
+
124
+ /* Return the Ruby version of the result. */
125
+
126
+ return INT2FIX(result);
127
+ }
@@ -0,0 +1,125 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ long l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ /* Get the sizes of both arrays. */
12
+
13
+ l1 = RARRAY_LEN(RARRAY(rb_o1));
14
+ l2 = RARRAY_LEN(RARRAY(rb_o2));
15
+
16
+ /* Convert Ruby's threshold to C's threshold. */
17
+
18
+ if (!NIL_P(rb_threshold)) {
19
+ threshold = FIX2INT(rb_threshold);
20
+ } else {
21
+ threshold = -1;
22
+ }
23
+
24
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
25
+
26
+ offset = 0;
27
+ while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
28
+ offset++;
29
+ }
30
+
31
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
32
+
33
+ while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
34
+ l1--;
35
+ l2--;
36
+ }
37
+
38
+ l1 -= offset;
39
+ l2 -= offset;
40
+
41
+ /* The Levenshtein algorithm itself. */
42
+
43
+ /* s1= */
44
+ /* ERIK */
45
+ /* */
46
+ /* 01234 */
47
+ /* s2=V 11234 */
48
+ /* E 21234 */
49
+ /* E 32234 */
50
+ /* N 43334 <- prev_row */
51
+ /* S 54444 <- curr_row */
52
+ /* T 65555 */
53
+ /* R 76566 */
54
+ /* A 87667 */
55
+
56
+ /* Allocate memory for both rows */
57
+
58
+ prev_row = ALLOC_N(int, l1+1);
59
+ curr_row = ALLOC_N(int, l1+1);
60
+
61
+ if ((prev_row == NULL) || (curr_row == NULL)) {
62
+ rb_raise(rb_eNoMemError, "out of memory");
63
+ }
64
+
65
+ /* Initialize the current row. */
66
+
67
+ for (col=0; col<=l1; col++) {
68
+ curr_row[col] = col;
69
+ }
70
+
71
+ for (row=1; row<=l2; row++) {
72
+ /* Copy the current row to the previous row. */
73
+
74
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
75
+
76
+ /* Calculate the values of the current row. */
77
+
78
+ curr_row[0] = row;
79
+ curr_row_min = row;
80
+
81
+ for (col=1; col<=l1; col++) {
82
+ /* Equal (cost=0) or substitution (cost=1). */
83
+
84
+ curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
85
+
86
+ /* Insertion if it's cheaper than substitution. */
87
+
88
+ if (prev_row[col]+1 < curr_row[col]) {
89
+ curr_row[col] = prev_row[col]+1;
90
+ }
91
+
92
+ /* Deletion if it's cheaper than substitution. */
93
+
94
+ if (curr_row[col-1]+1 < curr_row[col]) {
95
+ curr_row[col] = curr_row[col-1]+1;
96
+ }
97
+
98
+ /* Keep track of the minimum value on this row. */
99
+
100
+ if (curr_row[col] < curr_row_min) {
101
+ curr_row_min = curr_row[col];
102
+ }
103
+ }
104
+
105
+ /* Return nil as soon as we exceed the threshold. */
106
+
107
+ if (threshold > -1 && curr_row_min >= threshold) {
108
+ free(prev_row);
109
+ free(curr_row);
110
+
111
+ return Qnil;
112
+ }
113
+ }
114
+
115
+ /* The result is the last value on the last row. */
116
+
117
+ result = curr_row[l1];
118
+
119
+ free(prev_row);
120
+ free(curr_row);
121
+
122
+ /* Return the Ruby version of the result. */
123
+
124
+ return INT2FIX(result);
125
+ }
@@ -0,0 +1,21 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
5
+ return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
6
+ } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
7
+ if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
8
+ return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
9
+ } else {
10
+ return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
11
+ }
12
+ } else {
13
+ return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
14
+ }
15
+ }
16
+
17
+ void Init_levenshtein_in_c() {
18
+ VALUE mLevenshtein = rb_define_module("Levenshtein");
19
+
20
+ rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
21
+ }
@@ -0,0 +1,129 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ int l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+
11
+ ID id_length = rb_intern("length");
12
+ ID id_get = rb_intern("[]");
13
+ ID id_equal = rb_intern("==");
14
+
15
+ /* Get the sizes of both sequences. */
16
+
17
+ l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
18
+ l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
19
+
20
+ /* Convert Ruby's threshold to C's threshold. */
21
+
22
+ if (!NIL_P(rb_threshold)) {
23
+ threshold = FIX2INT(rb_threshold);
24
+ } else {
25
+ threshold = -1;
26
+ }
27
+
28
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
29
+
30
+ offset = 0;
31
+ while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
32
+ offset++;
33
+ }
34
+
35
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
36
+
37
+ while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
38
+ l1--;
39
+ l2--;
40
+ }
41
+
42
+ l1 -= offset;
43
+ l2 -= offset;
44
+
45
+ /* The Levenshtein algorithm itself. */
46
+
47
+ /* s1= */
48
+ /* ERIK */
49
+ /* */
50
+ /* 01234 */
51
+ /* s2=V 11234 */
52
+ /* E 21234 */
53
+ /* E 32234 */
54
+ /* N 43334 <- prev_row */
55
+ /* S 54444 <- curr_row */
56
+ /* T 65555 */
57
+ /* R 76566 */
58
+ /* A 87667 */
59
+
60
+ /* Allocate memory for both rows */
61
+
62
+ prev_row = ALLOC_N(int, l1+1);
63
+ curr_row = ALLOC_N(int, l1+1);
64
+
65
+ if ((prev_row == NULL) || (curr_row == NULL)) {
66
+ rb_raise(rb_eNoMemError, "out of memory");
67
+ }
68
+
69
+ /* Initialize the current row. */
70
+
71
+ for (col=0; col<=l1; col++) {
72
+ curr_row[col] = col;
73
+ }
74
+
75
+ for (row=1; row<=l2; row++) {
76
+ /* Copy the current row to the previous row. */
77
+
78
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
79
+
80
+ /* Calculate the values of the current row. */
81
+
82
+ curr_row[0] = row;
83
+ curr_row_min = row;
84
+
85
+ for (col=1; col<=l1; col++) {
86
+ /* Equal (cost=0) or substitution (cost=1). */
87
+
88
+ curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
+
90
+ /* Insertion if it's cheaper than substitution. */
91
+
92
+ if (prev_row[col]+1 < curr_row[col]) {
93
+ curr_row[col] = prev_row[col]+1;
94
+ }
95
+
96
+ /* Deletion if it's cheaper than substitution. */
97
+
98
+ if (curr_row[col-1]+1 < curr_row[col]) {
99
+ curr_row[col] = curr_row[col-1]+1;
100
+ }
101
+
102
+ /* Keep track of the minimum value on this row. */
103
+
104
+ if (curr_row[col] < curr_row_min) {
105
+ curr_row_min = curr_row[col];
106
+ }
107
+ }
108
+
109
+ /* Return nil as soon as we exceed the threshold. */
110
+
111
+ if (threshold > -1 && curr_row_min >= threshold) {
112
+ free(prev_row);
113
+ free(curr_row);
114
+
115
+ return Qnil;
116
+ }
117
+ }
118
+
119
+ /* The result is the last value on the last row. */
120
+
121
+ result = curr_row[l1];
122
+
123
+ free(prev_row);
124
+ free(curr_row);
125
+
126
+ /* Return the Ruby version of the result. */
127
+
128
+ return INT2FIX(result);
129
+ }
@@ -0,0 +1,133 @@
1
+ #include "ruby.h"
2
+
3
+ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
4
+ int threshold;
5
+ long l1, l2;
6
+ int *prev_row, *curr_row;
7
+ int col, row;
8
+ int curr_row_min, result;
9
+ int offset;
10
+ char *s1, *s2;
11
+
12
+ /* Convert Ruby's s1 to C's s1. */
13
+
14
+ rb_o1 = StringValue(rb_o1);
15
+ s1 = RSTRING_PTR(RSTRING(rb_o1));
16
+ l1 = RSTRING_LEN(RSTRING(rb_o1));
17
+
18
+ /* Convert Ruby's s2 to C's s2. */
19
+
20
+ rb_o2 = StringValue(rb_o2);
21
+ s2 = RSTRING_PTR(RSTRING(rb_o2));
22
+ l2 = RSTRING_LEN(RSTRING(rb_o2));
23
+
24
+ /* Convert Ruby's threshold to C's threshold. */
25
+
26
+ if (!NIL_P(rb_threshold)) {
27
+ threshold = FIX2INT(rb_threshold);
28
+ } else {
29
+ threshold = -1;
30
+ }
31
+
32
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
33
+
34
+ offset = 0;
35
+ while (s1[offset] == s2[offset]) {
36
+ offset++;
37
+ }
38
+
39
+ /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
40
+
41
+ while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
42
+ l1--;
43
+ l2--;
44
+ }
45
+
46
+ l1 -= offset;
47
+ l2 -= offset;
48
+
49
+ /* The Levenshtein algorithm itself. */
50
+
51
+ /* s1= */
52
+ /* ERIK */
53
+ /* */
54
+ /* 01234 */
55
+ /* s2=V 11234 */
56
+ /* E 21234 */
57
+ /* E 32234 */
58
+ /* N 43334 <- prev_row */
59
+ /* S 54444 <- curr_row */
60
+ /* T 65555 */
61
+ /* R 76566 */
62
+ /* A 87667 */
63
+
64
+ /* Allocate memory for both rows */
65
+
66
+ prev_row = ALLOC_N(int, l1+1);
67
+ curr_row = ALLOC_N(int, l1+1);
68
+
69
+ if ((prev_row == NULL) || (curr_row == NULL)) {
70
+ rb_raise(rb_eNoMemError, "out of memory");
71
+ }
72
+
73
+ /* Initialize the current row. */
74
+
75
+ for (col=0; col<=l1; col++) {
76
+ curr_row[col] = col;
77
+ }
78
+
79
+ for (row=1; row<=l2; row++) {
80
+ /* Copy the current row to the previous row. */
81
+
82
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
83
+
84
+ /* Calculate the values of the current row. */
85
+
86
+ curr_row[0] = row;
87
+ curr_row_min = row;
88
+
89
+ for (col=1; col<=l1; col++) {
90
+ /* Equal (cost=0) or substitution (cost=1). */
91
+
92
+ curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
93
+
94
+ /* Insertion if it's cheaper than substitution. */
95
+
96
+ if (prev_row[col]+1 < curr_row[col]) {
97
+ curr_row[col] = prev_row[col]+1;
98
+ }
99
+
100
+ /* Deletion if it's cheaper than substitution. */
101
+
102
+ if (curr_row[col-1]+1 < curr_row[col]) {
103
+ curr_row[col] = curr_row[col-1]+1;
104
+ }
105
+
106
+ /* Keep track of the minimum value on this row. */
107
+
108
+ if (curr_row[col] < curr_row_min) {
109
+ curr_row_min = curr_row[col];
110
+ }
111
+ }
112
+
113
+ /* Return nil as soon as we exceed the threshold. */
114
+
115
+ if (threshold > -1 && curr_row_min >= threshold) {
116
+ free(prev_row);
117
+ free(curr_row);
118
+
119
+ return Qnil;
120
+ }
121
+ }
122
+
123
+ /* The result is the last value on the last row. */
124
+
125
+ result = curr_row[l1];
126
+
127
+ free(prev_row);
128
+ free(curr_row);
129
+
130
+ /* Return the Ruby version of the result. */
131
+
132
+ return INT2FIX(result);
133
+ }
@@ -0,0 +1,20 @@
1
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "levenshtein/version"

Gem::Specification.new do |s|
  s.name        = "levenshtein-extended"
  s.version     = Levenshtein::VERSION
  s.authors     = ["Esdras Mayrink"]
  s.email       = ["falecom@oesdras.com.br"]
  s.homepage    = ""
  s.summary     = "fast string edit distance computation, using the Damerau-Levenshtein algorithm"
  s.description = "The levenshtein module implements fast Damerau-Levenshtein edit distance computation in O(n) memory and O(n^2) time, using a C wrapper."

  s.rubyforge_project = "levenshtein"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Build the C extension at install time. Without this entry RubyGems never
  # runs extconf.rb and lib/levenshtein.rb can only use a prebuilt binary or
  # the slow pure-Ruby fallback.
  s.extensions    = ["ext/levenshtein_in_c/extconf.rb"]

  # The Rakefile requires rake/extensiontask, which rake-compiler provides.
  s.add_development_dependency "rake-compiler"
end
@@ -0,0 +1,108 @@
1
# Load the fast C implementation. The first path is where the extension lives
# when compiled by RubyGems, the second where the build script puts it. When
# neither can be loaded, warn and carry on with the pure-Ruby implementation.
fast_extension_loaded = ["levenshtein/levenshtein_in_c", "levenshtein_in_c"].any? do |feature|
  begin
    require feature
    true
  rescue LoadError
    false
  end
end

unless fast_extension_loaded
  $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
end
10
+
11
+ # The Levenshtein distance is a metric for measuring the amount
12
+ # of difference between two sequences (i.e., the so called edit
13
+ # distance). The Levenshtein distance between two sequences is
14
+ # given by the minimum number of operations needed to transform
15
+ # one sequence into the other, where an operation is an
16
+ # insertion, deletion, or substitution of a single element.
17
+ #
18
+ # More information about the Levenshtein distance algorithm:
19
+ # http://en.wikipedia.org/wiki/Levenshtein_distance .
20
+
21
module Levenshtein

  # Returns the Levenshtein distance as a number between 0.0 and
  # 1.0. It's basically the Levenshtein distance divided by the
  # length of the longest sequence.
  #
  # threshold, when given, is a normalized cut-off: it is converted to an
  # absolute threshold for Levenshtein.distance, and nil is returned when
  # that call returns nil.

  def self.normalized_distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.

    # Since s1.length <= s2.length, s1 must be empty as well.
    return 0.0 if s2.length == 0

    return distance(s1, s2).to_f / s2.length unless threshold

    d = distance(s1, s2, (threshold * s2.length + 1).to_i)
    d && d.to_f / s2.length
  end

  # Returns the Levenshtein distance between two sequences.
  #
  # The two sequences can be two strings, two arrays, or two other
  # objects. Strings, arrays and arrays of strings are handled with
  # optimized (very fast) C code. All other sequences are handled
  # with generic (fast) C code.
  #
  # The sequences should respond to :length and :[] and all objects
  # in the sequences (as returned by []) should response to :==.
  #
  # When threshold is given, nil is returned as soon as the distance
  # is known to be greater than or equal to threshold.

  def self.distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.

    # Handle some basic circumstances.

    return 0 if s1 == s2

    # The difference in length is a lower bound for the distance. This check
    # runs before the empty-sequence shortcut below, so that an empty s1 obeys
    # the threshold exactly like every other input does.

    return nil if threshold && (s2.length - s1.length) >= threshold

    return s2.length if s1.length == 0

    if threshold
      # Elements that occur in only one of the sequences give a second lower
      # bound. Strings are split into characters first; sequences that support
      # :- (such as arrays) are used as they are.

      a1, a2 =
        if s1.respond_to?(:scan) && s2.respond_to?(:scan)
          [s1.scan(/./), s2.scan(/./)]
        elsif s1.respond_to?(:-) && s2.respond_to?(:-)
          [s1, s2]
        end

      if a1 && a2
        return nil if (a1 - a2).length >= threshold
        return nil if (a2 - a1).length >= threshold
      end
    end

    distance_fast_or_slow(s1, s2, threshold)
  end

  # Uses the C implementation when the extension was loaded and the Ruby
  # implementation otherwise.

  def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
    if respond_to?(:levenshtein_distance_fast)
      levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
    else
      levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
    end
  end

  # Pure-Ruby implementation that keeps only the previous and the current row
  # of the distance matrix. Returns nil when threshold is given and the
  # smallest value of a row reaches it.

  def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
    row = (0..s1.length).to_a

    1.upto(s2.length) do |y|
      prow = row
      row = [y]

      1.upto(s1.length) do |x|
        cost = s1[x-1] == s2[y-1] ? 0 : 1
        row[x] = [prow[x] + 1, row[x-1] + 1, prow[x-1] + cost].min
      end

      # Stop as soon as the best value in this row reaches the threshold.
      # (The minimum value in the next row will be equal to or greater
      # than the minimum value in this row.)

      return nil if threshold && row.min >= threshold
    end

    row[-1]
  end
end
@@ -0,0 +1,3 @@
1
module Levenshtein
  # Version string of the gem; levenshtein.gemspec reads it for s.version.
  VERSION = '0.0.1'
end
Binary file
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein-extended
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Esdras Mayrink
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-07-04 00:00:00 -03:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: The levenshtein module implements fast Damerau-Levenshtein edit distance computation in O(n) memory and O(n^2) time, using a C wrapper.
18
+ email:
19
+ - falecom@oesdras.com.br
20
+ executables: []
21
+
22
+ extensions: []
23
+
24
+ extra_rdoc_files: []
25
+
26
+ files:
27
+ - .gitignore
28
+ - Gemfile
29
+ - README
30
+ - Rakefile
31
+ - ext/levenshtein_in_c/extconf.rb
32
+ - ext/levenshtein_in_c/levenshtein_array.c
33
+ - ext/levenshtein_in_c/levenshtein_array_of_strings.c
34
+ - ext/levenshtein_in_c/levenshtein_fast.c
35
+ - ext/levenshtein_in_c/levenshtein_generic.c
36
+ - ext/levenshtein_in_c/levenshtein_string.c
37
+ - levenshtein.gemspec
38
+ - lib/levenshtein.rb
39
+ - lib/levenshtein/version.rb
40
+ - lib/levenshtein_in_c.bundle
41
+ has_rdoc: true
42
+ homepage: ""
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ requirements: []
63
+
64
+ rubyforge_project: levenshtein
65
+ rubygems_version: 1.5.2
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: fast string edit distance computation, using the Damerau-Levenshtein algorithm
69
+ test_files: []
70
+