levenshtein 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright Erik Veenstra <levenshtein@erikveen.dds.nl>
2
+ #
3
+ # This program is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU General Public License,
5
+ # version 2, as published by the Free Software Foundation.
6
+ #
7
+ # This program is distributed in the hope that it will be
8
+ # useful, but WITHOUT ANY WARRANTY; without even the implied
9
+ # warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
10
+ # PURPOSE. See the GNU General Public License for more details.
11
+ #
12
+ # You should have received a copy of the GNU General Public
13
+ # License along with this program; if not, write to the Free
14
+ # Software Foundation, Inc., 59 Temple Place, Suite 330,
15
+ # Boston, MA 02111-1307 USA.
data/README ADDED
@@ -0,0 +1,8 @@
1
+ # The Levenshtein distance is a metric for measuring the amount of difference
2
+ # between two sequences (i.e., the so-called edit distance). The Levenshtein
3
+ # distance between two strings is given by the minimum number of operations
4
+ # needed to transform one string into the other, where an operation is an
5
+ # insertion, deletion, or substitution of a single character.
6
+ #
7
+ # More information about the Levenshtein distance algorithm:
8
+ # http://en.wikipedia.org/wiki/Levenshtein_distance .
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,5 @@
1
# Build script for the C extension; generates the Makefile via mkmf.
require "mkmf"

# Register the --with-levenshtein-dir / -include / -lib configure options.
dir_config "levenshtein"

# Emit a Makefile that builds the extension as levenshtein/levenshtein_c.
create_makefile "levenshtein/levenshtein_c"
@@ -0,0 +1,122 @@
1
+ #include "ruby.h"
2
+
3
+ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
4
+ VALUE rb_s3;
5
+ int threshold;
6
+ int l1, l2, l3;
7
+ char *s1, *s2, *s3;
8
+ int *prev_row, *curr_row;
9
+ int col, row;
10
+ int curr_row_min, result;
11
+
12
+ /* Convert Ruby's s1 to C's s1. */
13
+
14
+ rb_s1 = StringValue(rb_s1);
15
+ s1 = RSTRING(rb_s1)->ptr;
16
+ l1 = RSTRING(rb_s1)->len;
17
+
18
+ /* Convert Ruby's s2 to C's s2. */
19
+
20
+ rb_s2 = StringValue(rb_s2);
21
+ s2 = RSTRING(rb_s2)->ptr;
22
+ l2 = RSTRING(rb_s2)->len;
23
+
24
+ /* Convert Ruby's threshold to C's threshold. */
25
+
26
+ if (!NIL_P(rb_threshold)) {
27
+ threshold = FIX2INT(rb_threshold);
28
+ } else {
29
+ threshold = -1;
30
+ }
31
+
32
+ /* The Levenshtein Algorithm itself. */
33
+
34
+ /* s1= */
35
+ /* ERIK */
36
+ /* */
37
+ /* 01234 */
38
+ /* s2=V 11234 */
39
+ /* E 21234 */
40
+ /* E 32234 */
41
+ /* N 43334 <- prev_row */
42
+ /* S 54444 <- curr_row */
43
+ /* T 65555 */
44
+ /* R 76566 */
45
+ /* A 87667 */
46
+
47
+ /* Allocate memory for both rows */
48
+
49
+ prev_row = ALLOC_N(int, l1+1);
50
+ curr_row = ALLOC_N(int, l1+1);
51
+
52
+ if ((prev_row == NULL) || (curr_row == NULL)) {
53
+ rb_raise(rb_eNoMemError, "out of memory");
54
+ }
55
+
56
+ /* Initialize the current row. */
57
+
58
+ for (col=0; col<=l1; col++) {
59
+ curr_row[col] = col;
60
+ }
61
+
62
+ for (row=1; row<=l2; row++) {
63
+ /* Copy the current row to the previous row. */
64
+
65
+ memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
66
+
67
+ /* Calculate the values of the current row. */
68
+
69
+ curr_row[0] = row;
70
+ curr_row_min = row;
71
+
72
+ for (col=1; col<=l1; col++) {
73
+ /* Equal (cost=0) or Substitution (cost=1). */
74
+
75
+ curr_row[col] = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);
76
+
77
+ /* Insertion if it's cheaper than substitution. */
78
+
79
+ if (prev_row[col]+1 < curr_row[col]) {
80
+ curr_row[col] = prev_row[col]+1;
81
+ }
82
+
83
+ /* Deletion if it's cheaper than substitution. */
84
+
85
+ if (curr_row[col-1]+1 < curr_row[col]) {
86
+ curr_row[col] = curr_row[col-1]+1;
87
+ }
88
+
89
+ /* Keep track of the minimum value on this row. */
90
+
91
+ if (curr_row[col] < curr_row_min) {
92
+ curr_row_min = curr_row[col];
93
+ }
94
+ }
95
+
96
+ /* Return nil as soon as we exceed the threshold. */
97
+
98
+ if (threshold > -1 && curr_row_min >= threshold) {
99
+ free(prev_row);
100
+ free(curr_row);
101
+
102
+ return Qnil;
103
+ }
104
+ }
105
+
106
+ /* The result is the last value on the last row. */
107
+
108
+ result = curr_row[l1];
109
+
110
+ free(prev_row);
111
+ free(curr_row);
112
+
113
+ /* Return the Ruby version of the result. */
114
+
115
+ return INT2FIX(result);
116
+ }
117
+
118
+ void Init_levenshtein_c() {
119
+ VALUE mLevenshtein = rb_define_module("Levenshtein");
120
+
121
+ rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
122
+ }
@@ -0,0 +1,100 @@
1
# Try to load the C implementation of Levenshtein.distance_part2, first from
# its namespaced location and then from the bare load path. If neither can be
# loaded, warn on stderr; the pure-Ruby implementation is used in that case.
fast_implementation_loaded =
  ["levenshtein/levenshtein_c", "levenshtein_c"].any? do |library|
    begin
      require library
      true
    rescue LoadError
      false
    end
  end

unless fast_implementation_loaded
  $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
end
10
+
11
+ # The Levenshtein distance is a metric for measuring the amount of difference
12
+ # between two sequences (i.e., the so-called edit distance). The Levenshtein
13
+ # distance between two strings is given by the minimum number of operations
14
+ # needed to transform one string into the other, where an operation is an
15
+ # insertion, deletion, or substitution of a single character.
16
+ #
17
+ # More information about the Levenshtein distance algorithm:
18
+ # http://en.wikipedia.org/wiki/Levenshtein_distance .
19
+
20
module Levenshtein
  # Returns the Levenshtein distance as a number between 0.0 and 1.0.
  # It's basically the Levenshtein distance divided by the length of the
  # longest string.
  #
  # s1, s2    - the strings to compare (order does not matter).
  # threshold - optional fraction; it is converted to an absolute threshold
  #             of (threshold * longest_length + 1).to_i and passed on to
  #             +distance+.
  #
  # Returns a Float, or nil when +distance+ returns nil for that threshold.
  # Two empty strings yield 0.0.

  def self.normalized_distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.

    # Since s1.length <= s2.length, s1 must be empty as well.
    return 0.0 if s2.empty?

    longest = s2.length

    if threshold
      d = distance(s1, s2, (threshold*longest+1).to_i)

      d && d.to_f/longest
    else
      distance(s1, s2).to_f/longest
    end
  end

  # Returns the Levenshtein distance between two byte strings.
  #
  # s1, s2    - the strings to compare (order does not matter).
  # threshold - optional Integer. When given, nil is returned as soon as the
  #             distance is known to be greater than or equal to it. Note
  #             that the two shortcuts for equal strings and for an empty
  #             string return before the threshold is looked at.
  #
  # Returns an Integer, or nil when the threshold cut-off applies.

  def self.distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.

    # Handle some basic circumstances.

    return 0 if s1 == s2
    return s2.length if s1.empty?

    if threshold
      # Cheap lower bounds on the distance: the difference in length, and the
      # number of characters of one string that never occur in the other.

      return nil if (s2.length-s1.length) >= threshold

      chars1 = s1.scan(/./)
      chars2 = s2.scan(/./)

      return nil if (chars1 - chars2).length >= threshold
      return nil if (chars2 - chars1).length >= threshold
    end

    # Do the expensive calculation on a subset of the strings only, if possible.

    # Skip the common head.

    b = 0
    b += 1 while b < s1.length && s1[b, 1] == s2[b, 1]

    # Skip the common tail, but never walk back into the head that has
    # already been skipped. Without the "e1 >= b" bound the indexes turn
    # negative, wrap around to the end of the strings and characters get
    # matched twice (e.g. "aa" vs. "aaa" came out as 0 instead of 1).
    # Since s1 is the short one, e2 >= e1 always holds.

    e1 = s1.length-1
    e2 = s2.length-1

    while e1 >= b && s1[e1, 1] == s2[e2, 1]
      e1 -= 1
      e2 -= 1
    end

    # Slice by (start, length): a range ending in -1 would select the whole
    # rest of the string instead of an empty middle part.

    distance_part2(s1[b, e1-b+1], s2[b, e2-b+1], threshold)
  end

  # Dispatches to the C implementation when the extension has been loaded,
  # and to the Ruby implementation otherwise.

  def self.distance_part2(s1, s2, threshold) # :nodoc:
    if respond_to?(:distance_part2_fast)
      distance_part2_fast(s1, s2, threshold) # Implemented in C.
    else
      distance_part2_slow(s1, s2, threshold) # Implemented in Ruby.
    end
  end

  # Pure-Ruby dynamic-programming implementation. Only two rows of the
  # matrix are kept: the previous one (prow) and the one being filled (row).
  # Returns nil as soon as the minimum of a row is >= threshold.

  def self.distance_part2_slow(s1, s2, threshold) # :nodoc:
    row = (0..s1.length).to_a

    1.upto(s2.length) do |y|
      prow = row
      row = [y]

      1.upto(s1.length) do |x|
        row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
      end

      # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
      # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)

      return nil if threshold && row.min >= threshold
    end

    row[-1]
  end
end
@@ -0,0 +1,90 @@
1
+ require "test/unit"
2
+ require "levenshtein"
3
+
4
# Exercises the public API: Levenshtein.distance and
# Levenshtein.normalized_distance.
class TestLevenshtein < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal(7, Levenshtein.distance("erik", "veenstra"))
    assert_equal(7, Levenshtein.distance("veenstra", "erik"))

    assert_in_delta(0.875, Levenshtein.normalized_distance("erik", "veenstra"), 0.01)
    assert_in_delta(0.875, Levenshtein.normalized_distance("veenstra", "erik"), 0.01)
  end

  def test_empty_string
    assert_equal(0, Levenshtein.distance("", ""))
    assert_equal(3, Levenshtein.distance("", "foo"))
    assert_equal(3, Levenshtein.distance("foo", ""))

    assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
    assert_in_delta(1.0, Levenshtein.normalized_distance("", "foo"), 0.01)
    assert_in_delta(1.0, Levenshtein.normalized_distance("foo", ""), 0.01)
  end

  def test_same_string
    assert_equal(0, Levenshtein.distance("", ""))
    assert_equal(0, Levenshtein.distance("foo", "foo"))

    assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
    assert_in_delta(0.0, Levenshtein.normalized_distance("foo", "foo"), 0.01)
  end

  # A distance that reaches the threshold is reported as nil.
  def test_threshold
    assert_equal(3, Levenshtein.distance("foo", "foobar"))
    assert_nil(Levenshtein.distance("foo", "foobar", 2))

    assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
    assert_nil(Levenshtein.normalized_distance("foo", "foobar", 0.30))
  end

  # Common prefixes and suffixes must not influence the result.
  def test_same_head_and_or_tail
    assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd"))
    assert_equal(3, Levenshtein.distance("ab123", "abxyz"))
    assert_equal(3, Levenshtein.distance("123cd", "xyzcd"))

    assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
    assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
    assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
  end
end
49
+
50
# Exercises the pure-Ruby implementation directly.
class TestLevenshteinPart2Slow < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal(7, Levenshtein.distance_part2_slow("erik", "veenstra", nil))
  end

  def test_empty_string
    assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
    assert_equal(3, Levenshtein.distance_part2_slow("", "foo", nil))
  end

  def test_same_string
    assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
    assert_equal(0, Levenshtein.distance_part2_slow("foo", "foo", nil))
  end

  # A distance that reaches the threshold is reported as nil.
  def test_threshold
    assert_equal(3, Levenshtein.distance_part2_slow("foo", "foobar", nil))
    assert_nil(Levenshtein.distance_part2_slow("foo", "foobar", 2))
  end
end
70
+
71
# Exercises the C implementation directly.
class TestLevenshteinPart2Fast < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal(7, Levenshtein.distance_part2_fast("erik", "veenstra", nil))
  end

  def test_empty_string
    assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
    assert_equal(3, Levenshtein.distance_part2_fast("", "foo", nil))
  end

  def test_same_string
    assert_equal(0, Levenshtein.distance_part2_fast("", "", nil))
    assert_equal(0, Levenshtein.distance_part2_fast("foo", "foo", nil))
  end

  # A distance that reaches the threshold is reported as nil.
  def test_threshold
    assert_equal(3, Levenshtein.distance_part2_fast("foo", "foobar", nil))
    assert_nil(Levenshtein.distance_part2_fast("foo", "foobar", 2))
  end
end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: levenshtein
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Erik Veenstra
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-05-24 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Calculates the Levenshtein distance between two byte strings.
17
+ email: levenshtein@erikveen.dds.nl
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/levenshtein/extconf.rb
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/levenshtein.rb
26
+ - ext/levenshtein
27
+ - ext/levenshtein/extconf.rb
28
+ - ext/levenshtein/levenshtein_c.c
29
+ - README
30
+ - LICENSE
31
+ - VERSION
32
+ has_rdoc: true
33
+ homepage: http://www.erikveen.dds.nl/levenshtein/index.html
34
+ post_install_message:
35
+ rdoc_options:
36
+ - README
37
+ - LICENSE
38
+ - VERSION
39
+ - --title
40
+ - levenshtein (0.1.0)
41
+ - --main
42
+ - README
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ version:
57
+ requirements: []
58
+
59
+ rubyforge_project: levenshtein
60
+ rubygems_version: 1.0.1
61
+ signing_key:
62
+ specification_version: 2
63
+ summary: Calculates the Levenshtein distance between two byte strings.
64
+ test_files:
65
+ - test/test.rb