levenshtein 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,4 +1,8 @@
1
- 0.2.1 (11-02-2012)
1
+ 0.2.2 (16-03-2012)
2
+
3
+ * Simplified code.
4
+
5
+ 0.2.1 (11-03-2012)
2
6
 
3
7
  * Better memory handling.
4
8
 
data/README CHANGED
@@ -1,12 +1,15 @@
1
- The Levenshtein distance is a metric for measuring the amount of difference
2
- between two sequences (i.e., the so called edit distance). The Levenshtein
3
- distance between two sequences is given by the minimum number of operations
4
- needed to transform one sequence into the other, where an operation is an
1
+ The Levenshtein distance is a metric for measuring the amount
2
+ of difference between two sequences (i.e., the so called edit
3
+ distance). The Levenshtein distance between two sequences is
4
+ given by the minimum number of operations needed to transform
5
+ one sequence into the other, where an operation is an
5
6
  insertion, deletion, or substitution of a single element.
6
7
 
7
- The two sequences can be two strings, two arrays, or two other objects.
8
- Strings, arrays and arrays of strings are handled with optimized (very fast) C
9
- code. All other sequences are handled with generic (fast) C code.
8
+ The two sequences can be two strings, two arrays, or two other
9
+ objects responding to :each. All sequences are handled by generic
10
+ (fast) C code.
11
+
12
+ All objects in the sequences should respond to :hash and :eql?.
10
13
 
11
14
  More information about the Levenshtein distance algorithm:
12
15
  http://en.wikipedia.org/wiki/Levenshtein_distance .
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.2.2
@@ -2,17 +2,121 @@
2
2
  #include "levenshtein.h"
3
3
 
4
4
  VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
6
- return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
7
- } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
8
- if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
9
- return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
10
- } else {
11
- return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
12
- }
5
+ VALUE *p1, *p2;
6
+ long l1, l2;
7
+ long col, row;
8
+ int threshold;
9
+ int *prev_row, *curr_row, *temp_row;
10
+ int curr_row_min, result;
11
+ int value1, value2;
12
+
13
+ /* Be sure that all equivalent objects in rb_o1 and rb_o2 (a.eql?(b) == true) are taken from a pool (a.equal?(b) == true). */
14
+ /* This is done in levenshtein.rb by means of Util.pool. */
15
+
16
+ /* Get the sizes of both arrays. */
17
+
18
+ l1 = RARRAY_LEN(rb_o1);
19
+ l2 = RARRAY_LEN(rb_o2);
20
+
21
+ /* Get the pointers of both arrays. */
22
+
23
+ p1 = RARRAY_PTR(rb_o1);
24
+ p2 = RARRAY_PTR(rb_o2);
25
+
26
+ /* Convert Ruby's threshold to C's threshold. */
27
+
28
+ if (!NIL_P(rb_threshold)) {
29
+ threshold = FIX2INT(rb_threshold);
13
30
  } else {
14
- return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
31
+ threshold = -1;
32
+ }
33
+
34
+ /* The Levenshtein algorithm itself. */
35
+
36
+ /* s1= */
37
+ /* ERIK */
38
+ /* */
39
+ /* 01234 */
40
+ /* s2=V 11234 */
41
+ /* E 21234 */
42
+ /* E 32234 */
43
+ /* N 43334 <- prev_row */
44
+ /* S 54444 <- curr_row */
45
+ /* T 65555 */
46
+ /* R 76566 */
47
+ /* A 87667 */
48
+
49
+ /* Allocate memory for both rows */
50
+
51
+ prev_row = (int*) ALLOC_N(int, (l1+1));
52
+ curr_row = (int*) ALLOC_N(int, (l1+1));
53
+
54
+ /* Initialize the current row. */
55
+
56
+ for (col=0; col<=l1; col++) {
57
+ curr_row[col] = col;
15
58
  }
59
+
60
+ for (row=1; row<=l2; row++) {
61
+ /* Copy the current row to the previous row. */
62
+
63
+ temp_row = prev_row;
64
+ prev_row = curr_row;
65
+ curr_row = temp_row;
66
+
67
+ /* Calculate the values of the current row. */
68
+
69
+ curr_row[0] = row;
70
+ curr_row_min = row;
71
+
72
+ for (col=1; col<=l1; col++) {
73
+ /* Equal (cost=0) or substitution (cost=1). */
74
+
75
+ value1 = prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1);
76
+
77
+ /* Insertion if it's cheaper than substitution. */
78
+
79
+ value2 = prev_row[col]+1;
80
+ if (value2 < value1) {
81
+ value1 = value2;
82
+ }
83
+
84
+ /* Deletion if it's cheaper than substitution. */
85
+
86
+ value2 = curr_row[col-1]+1;
87
+ if (value2 < value1) {
88
+ value1 = value2;
89
+ }
90
+
91
+ /* Keep track of the minimum value on this row. */
92
+
93
+ if (value1 < curr_row_min) {
94
+ curr_row_min = value1;
95
+ }
96
+
97
+ curr_row[col] = value1;
98
+ }
99
+
100
+ /* Return nil as soon as we exceed the threshold. */
101
+
102
+ if (threshold > -1 && curr_row_min >= threshold) {
103
+ free(prev_row);
104
+ free(curr_row);
105
+
106
+ return Qnil;
107
+ }
108
+ }
109
+
110
+ /* The result is the last value on the last row. */
111
+
112
+ result = curr_row[l1];
113
+
114
+ free(prev_row);
115
+ free(curr_row);
116
+
117
+ /* Return the Ruby version of the result. */
118
+
119
+ return INT2FIX(result);
16
120
  }
17
121
 
18
122
  void Init_levenshtein_fast() {
@@ -1,25 +1,30 @@
1
- require "levenshtein/exception"
1
+ # encoding: UTF-8
2
+
2
3
  require "levenshtein/version"
3
4
 
4
5
  module Levenshtein
5
6
  # Returns the Levenshtein distance as a number between 0.0 and
6
7
  # 1.0. It's basically the Levenshtein distance divided by the
7
- # length of the longest sequence.
8
+ # size of the longest sequence.
8
9
 
9
- def self.normalized_distance(a1, a2, threshold=nil)
10
- a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
10
+ def self.normalized_distance(a1, a2, threshold=nil, options={})
11
+ size = [a1.size, a2.size].max
11
12
 
12
- if a2.length == 0
13
- 0.0 # Since a1.length < a2.length, a1 must be empty as well.
13
+ if a1.size == 0 and a2.size == 0
14
+ 0.0
15
+ elsif a1.size == 0
16
+ a2.size.to_f/size
17
+ elsif a2.size == 0
18
+ a1.size.to_f/size
14
19
  else
15
20
  if threshold
16
- if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
17
- d.to_f/a2.length
21
+ if d = self.distance(a1, a2, (threshold*size).to_i+1)
22
+ d.to_f/size
18
23
  else
19
24
  nil
20
25
  end
21
26
  else
22
- self.distance(a1, a2).to_f/a2.length
27
+ self.distance(a1, a2).to_f/size
23
28
  end
24
29
  end
25
30
  end
@@ -27,71 +32,79 @@ module Levenshtein
27
32
  # Returns the Levenshtein distance between two sequences.
28
33
  #
29
34
  # The two sequences can be two strings, two arrays, or two other
30
- # objects. Strings, arrays and arrays of strings are handled with
31
- # optimized (very fast) C code. All other sequences are handled
32
- # with generic (fast) C code.
35
+ # objects responding to :each. All sequences are handled by generic
36
+ # (fast) C code.
33
37
  #
34
- # The sequences should respond to :length and :[] and all objects
35
- # in the sequences (as returned by []) should response to :==.
38
+ # All objects in the sequences should respond to :hash and :eql?.
36
39
 
37
- def self.distance(a1, a2, threshold=nil)
38
- a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
40
+ def self.distance(a1, a2, threshold=nil, options={})
41
+ a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2
42
+ a1, a2 = Util.pool(a1, a2)
39
43
 
40
44
  # Handle some basic circumstances.
41
45
 
42
46
  return 0 if a1 == a2
43
- return a2.length if a1.length == 0
47
+ return a2.size if a1.empty?
48
+ return a1.size if a2.empty?
44
49
 
45
50
  if threshold
46
- return nil if (a2.length-a1.length) >= threshold
47
-
48
- a3, a4 = nil, nil
49
- a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
50
- a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
51
-
52
- if a3 and a4
53
- return nil if (a3-a4).length >= threshold
54
- return nil if (a4-a3).length >= threshold
55
- end
51
+ return nil if (a1.size-a2.size) >= threshold
52
+ return nil if (a2.size-a1.size) >= threshold
53
+ return nil if (a1-a2).size >= threshold
54
+ return nil if (a2-a1).size >= threshold
56
55
  end
57
56
 
58
- distance_fast_or_slow(a1, a2, threshold)
59
- end
60
-
61
- def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
62
- if respond_to?(:distance_fast)
63
- distance_fast(a1, a2, threshold) # Implemented in C.
64
- else
65
- distance_slow(a1, a2, threshold) # Implemented in Ruby.
66
- end
67
- end
57
+ # Remove the common prefix and the common postfix.
68
58
 
69
- def self.distance_slow(a1, a2, threshold) # :nodoc:
70
- l1 = a1.length
71
- l2 = a2.length
59
+ l1 = a1.size
60
+ l2 = a2.size
72
61
 
73
- offset = 0
62
+ offset = 0
63
+ no_more_optimizations = true
74
64
 
75
- while offset < l1 and offset < l2 and a1[offset] == a2[offset]
65
+ while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
76
66
  offset += 1
67
+
68
+ no_more_optimizations = false
77
69
  end
78
70
 
79
- while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
71
+ while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
80
72
  l1 -= 1
81
73
  l2 -= 1
74
+
75
+ no_more_optimizations = false
82
76
  end
83
77
 
84
- l1 -= offset
85
- l2 -= offset
78
+ if no_more_optimizations
79
+ distance_fast_or_slow(a1, a2, threshold, options)
80
+ else
81
+ l1 -= offset
82
+ l2 -= offset
83
+
84
+ a1 = a1[offset, l1]
85
+ a2 = a2[offset, l2]
86
86
 
87
- crow = (0..l1).to_a
87
+ distance(a1, a2, threshold, options)
88
+ end
89
+ end
88
90
 
89
- 1.upto(l2) do |y|
91
+ def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc:
92
+ if respond_to?(:distance_fast) and options[:force_slow]
93
+ distance_fast(a1, a2, threshold) # Implemented in C.
94
+ else
95
+ distance_slow(a1, a2, threshold) # Implemented in Ruby.
96
+ end
97
+ end
98
+
99
+ def self.distance_slow(a1, a2, threshold) # :nodoc:
100
+ crow = (0..a1.size).to_a
101
+
102
+ 1.upto(a2.size) do |y|
90
103
  prow = crow
91
104
  crow = [y]
92
105
 
93
- 1.upto(l1) do |x|
94
- crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
106
+ 1.upto(a1.size) do |x|
107
+ crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
95
108
  end
96
109
 
97
110
  # Stop analysing this sequence as soon as the best possible
@@ -104,6 +117,24 @@ module Levenshtein
104
117
 
105
118
  crow[-1]
106
119
  end
120
+
121
+ module Util # :nodoc:
122
+ def self.pool(*args)
123
+ # So we can compare pointers instead of objects (equal?() instead of ==()).
124
+
125
+ pool = {}
126
+
127
+ args.collect do |arg|
128
+ a = []
129
+
130
+ arg.each do |o|
131
+ a << pool[o] ||= o
132
+ end
133
+
134
+ a
135
+ end
136
+ end
137
+ end
107
138
  end
108
139
 
109
140
  begin
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  module Levenshtein
2
- VERSION = "0.2.1"
4
+ VERSION = "0.2.2"
3
5
  end
@@ -1,3 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
1
4
  require "test/unit"
2
5
  require "levenshtein"
3
6
 
@@ -7,14 +10,10 @@ module Levenshtein
7
10
  @sequence = o
8
11
  end
9
12
 
10
- def length
11
- @sequence.length
12
- end
13
-
14
- def [](pos)
15
- raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
16
-
17
- @sequence[pos]
13
+ def each
14
+ @sequence.length.times do |pos|
15
+ yield(@sequence[pos])
16
+ end
18
17
  end
19
18
  end
20
19
 
@@ -24,14 +23,18 @@ module Levenshtein
24
23
  def initialize(o)
25
24
  @object = o
26
25
  end
27
-
28
- def ==(other)
29
- @object == other.object
26
+
27
+ def hash
28
+ @object.hash
29
+ end
30
+
31
+ def eql?(other)
32
+ @object.eql?(other.object)
30
33
  end
31
34
  end
32
35
  end
33
36
 
34
- class TestLevenshteinString < Test::Unit::TestCase
37
+ class TestLevenshtein < Test::Unit::TestCase
35
38
  def test_erik_veenstra
36
39
  assert_equal(7, Levenshtein.distance("erik", "veenstra"))
37
40
  assert_equal(7, Levenshtein.distance("veenstra", "erik"))
@@ -79,59 +82,73 @@ class TestLevenshteinString < Test::Unit::TestCase
79
82
  assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
80
83
  assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
81
84
  end
82
- end
83
85
 
84
- class TestLevenshteinArray < Test::Unit::TestCase
85
- def test_erik_veenstra
86
- x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
86
+ def test_interface
87
+ seq1 = Levenshtein::TestSequence.new("erik".scan(/./).collect{|e| Levenshtein::TestElement.new(e)})
88
+ seq2 = Levenshtein::TestSequence.new("veenstra".scan(/./).collect{|e| Levenshtein::TestElement.new(e)})
87
89
 
88
- assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
90
+ assert_equal(7, Levenshtein.distance(seq1, seq2))
89
91
  end
90
92
  end
91
93
 
92
- class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
94
+ class TestLevenshteinFast < Test::Unit::TestCase
93
95
  def test_erik_veenstra
94
- x = lambda{|s| s.scan(/./)}
96
+ assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>false))
97
+ assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>false))
98
+ end
95
99
 
96
- assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
100
+ def test_empty_string
101
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false))
102
+ assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>false))
103
+ assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>false))
97
104
  end
98
- end
99
105
 
100
- class TestLevenshteinGeneric < Test::Unit::TestCase
101
- def test_erik_veenstra
102
- x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
106
+ def test_same_string
107
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false))
108
+ assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>false))
109
+ end
103
110
 
104
- assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
111
+ def test_threshold
112
+ assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>false))
113
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>false))
114
+ assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>false))
115
+ end
116
+
117
+ def test_same_head_and_or_tail
118
+ assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>false))
119
+ assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>false))
120
+ assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>false))
121
+ assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>false))
105
122
  end
106
123
  end
107
124
 
108
125
  class TestLevenshteinSlow < Test::Unit::TestCase
109
126
  def test_erik_veenstra
110
- assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
111
- assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
127
+ assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>true))
128
+ assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>true))
112
129
  end
113
130
 
114
131
  def test_empty_string
115
- assert_equal(0, Levenshtein.distance_slow("", "", nil))
116
- assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
117
- assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
132
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true))
133
+ assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>true))
134
+ assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>true))
118
135
  end
119
136
 
120
137
  def test_same_string
121
- assert_equal(0, Levenshtein.distance_slow("", "", nil))
122
- assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
138
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true))
139
+ assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>true))
123
140
  end
124
141
 
125
142
  def test_threshold
126
- assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
127
- assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
128
- assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
143
+ assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>true))
144
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>true))
145
+ assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>true))
129
146
  end
130
147
 
131
148
  def test_same_head_and_or_tail
132
- assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
133
- assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
134
- assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
135
- assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
149
+ assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>true))
150
+ assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>true))
151
+ assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>true))
152
+ assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>true))
136
153
  end
137
154
  end
metadata CHANGED
@@ -1,41 +1,27 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: levenshtein
3
- version: !ruby/object:Gem::Version
4
- hash: 21
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 2
9
- - 1
10
- version: 0.2.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Erik Veenstra
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-02-11 00:00:00 Z
12
+ date: 2012-03-16 00:00:00.000000000 Z
19
13
  dependencies: []
20
-
21
14
  description: Calculates the Levenshtein distance between two byte strings.
22
15
  email: levenshtein@erikveen.dds.nl
23
16
  executables: []
24
-
25
- extensions:
17
+ extensions:
26
18
  - ext/levenshtein/extconf.rb
27
19
  extra_rdoc_files: []
28
-
29
- files:
30
- - lib/levenshtein/exception.rb
20
+ files:
31
21
  - lib/levenshtein/version.rb
32
22
  - lib/levenshtein.rb
33
- - ext/levenshtein/levenshtein_string.c
34
- - ext/levenshtein/levenshtein_generic.c
35
23
  - ext/levenshtein/levenshtein.h
36
24
  - ext/levenshtein/levenshtein_fast.c
37
- - ext/levenshtein/levenshtein_array_of_strings.c
38
- - ext/levenshtein/levenshtein_array.c
39
25
  - ext/levenshtein/extconf.rb
40
26
  - README
41
27
  - LICENSE
@@ -44,43 +30,35 @@ files:
44
30
  - test/test.rb
45
31
  homepage: http://www.erikveen.dds.nl/levenshtein/index.html
46
32
  licenses: []
47
-
48
33
  post_install_message:
49
- rdoc_options:
34
+ rdoc_options:
50
35
  - README
51
36
  - LICENSE
52
37
  - VERSION
53
38
  - CHANGELOG
54
39
  - --title
55
- - levenshtein (0.2.1)
40
+ - levenshtein (0.2.2)
56
41
  - --main
57
42
  - README
58
- require_paths:
43
+ require_paths:
59
44
  - lib
60
- required_ruby_version: !ruby/object:Gem::Requirement
45
+ required_ruby_version: !ruby/object:Gem::Requirement
61
46
  none: false
62
- requirements:
63
- - - ">="
64
- - !ruby/object:Gem::Version
65
- hash: 3
66
- segments:
67
- - 0
68
- version: "0"
69
- required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
52
  none: false
71
- requirements:
72
- - - ">="
73
- - !ruby/object:Gem::Version
74
- hash: 3
75
- segments:
76
- - 0
77
- version: "0"
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
78
57
  requirements: []
79
-
80
58
  rubyforge_project: levenshtein
81
- rubygems_version: 1.8.12
59
+ rubygems_version: 1.8.18
82
60
  signing_key:
83
61
  specification_version: 3
84
62
  summary: Calculates the Levenshtein distance between two byte strings.
85
- test_files:
63
+ test_files:
86
64
  - test/test.rb
@@ -1,130 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
-
13
- /* Get the sizes of both arrays. */
14
-
15
- l1 = RARRAY_LEN(rb_o1);
16
- l2 = RARRAY_LEN(rb_o2);
17
-
18
- /* Convert Ruby's threshold to C's threshold. */
19
-
20
- if (!NIL_P(rb_threshold)) {
21
- threshold = FIX2INT(rb_threshold);
22
- } else {
23
- threshold = -1;
24
- }
25
-
26
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
-
28
- offset = 0;
29
-
30
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
31
- offset++;
32
- }
33
-
34
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
35
-
36
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
37
- l1--;
38
- l2--;
39
- }
40
-
41
- l1 -= offset;
42
- l2 -= offset;
43
-
44
- /* The Levenshtein algorithm itself. */
45
-
46
- /* s1= */
47
- /* ERIK */
48
- /* */
49
- /* 01234 */
50
- /* s2=V 11234 */
51
- /* E 21234 */
52
- /* E 32234 */
53
- /* N 43334 <- prev_row */
54
- /* S 54444 <- curr_row */
55
- /* T 65555 */
56
- /* R 76566 */
57
- /* A 87667 */
58
-
59
- /* Allocate memory for both rows */
60
-
61
- prev_row = (int*) ALLOC_N(int, (l1+1));
62
- curr_row = (int*) ALLOC_N(int, (l1+1));
63
-
64
- /* Initialize the current row. */
65
-
66
- for (col=0; col<=l1; col++) {
67
- curr_row[col] = col;
68
- }
69
-
70
- for (row=1; row<=l2; row++) {
71
- /* Copy the current row to the previous row. */
72
-
73
- temp_row = prev_row;
74
- prev_row = curr_row;
75
- curr_row = temp_row;
76
-
77
- /* Calculate the values of the current row. */
78
-
79
- curr_row[0] = row;
80
- curr_row_min = row;
81
-
82
- for (col=1; col<=l1; col++) {
83
- /* Equal (cost=0) or substitution (cost=1). */
84
-
85
- value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
86
-
87
- /* Insertion if it's cheaper than substitution. */
88
-
89
- value2 = prev_row[col]+1;
90
- if (value2 < value1) {
91
- value1 = value2;
92
- }
93
-
94
- /* Deletion if it's cheaper than substitution. */
95
-
96
- value2 = curr_row[col-1]+1;
97
- if (value2 < value1) {
98
- value1 = value2;
99
- }
100
-
101
- /* Keep track of the minimum value on this row. */
102
-
103
- if (value1 < curr_row_min) {
104
- curr_row_min = value1;
105
- }
106
-
107
- curr_row[col] = value1;
108
- }
109
-
110
- /* Return nil as soon as we exceed the threshold. */
111
-
112
- if (threshold > -1 && curr_row_min >= threshold) {
113
- free(prev_row);
114
- free(curr_row);
115
-
116
- return Qnil;
117
- }
118
- }
119
-
120
- /* The result is the last value on the last row. */
121
-
122
- result = curr_row[l1];
123
-
124
- free(prev_row);
125
- free(curr_row);
126
-
127
- /* Return the Ruby version of the result. */
128
-
129
- return INT2FIX(result);
130
- }
@@ -1,130 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
-
13
- /* Get the sizes of both arrays. */
14
-
15
- l1 = RARRAY_LEN(rb_o1);
16
- l2 = RARRAY_LEN(rb_o2);
17
-
18
- /* Convert Ruby's threshold to C's threshold. */
19
-
20
- if (!NIL_P(rb_threshold)) {
21
- threshold = FIX2INT(rb_threshold);
22
- } else {
23
- threshold = -1;
24
- }
25
-
26
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
-
28
- offset = 0;
29
-
30
- while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
31
- offset++;
32
- }
33
-
34
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
35
-
36
- while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
37
- l1--;
38
- l2--;
39
- }
40
-
41
- l1 -= offset;
42
- l2 -= offset;
43
-
44
- /* The Levenshtein algorithm itself. */
45
-
46
- /* s1= */
47
- /* ERIK */
48
- /* */
49
- /* 01234 */
50
- /* s2=V 11234 */
51
- /* E 21234 */
52
- /* E 32234 */
53
- /* N 43334 <- prev_row */
54
- /* S 54444 <- curr_row */
55
- /* T 65555 */
56
- /* R 76566 */
57
- /* A 87667 */
58
-
59
- /* Allocate memory for both rows */
60
-
61
- prev_row = (int*) ALLOC_N(int, (l1+1));
62
- curr_row = (int*) ALLOC_N(int, (l1+1));
63
-
64
- /* Initialize the current row. */
65
-
66
- for (col=0; col<=l1; col++) {
67
- curr_row[col] = col;
68
- }
69
-
70
- for (row=1; row<=l2; row++) {
71
- /* Copy the current row to the previous row. */
72
-
73
- temp_row = prev_row;
74
- prev_row = curr_row;
75
- curr_row = temp_row;
76
-
77
- /* Calculate the values of the current row. */
78
-
79
- curr_row[0] = row;
80
- curr_row_min = row;
81
-
82
- for (col=1; col<=l1; col++) {
83
- /* Equal (cost=0) or substitution (cost=1). */
84
-
85
- value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
86
-
87
- /* Insertion if it's cheaper than substitution. */
88
-
89
- value2 = prev_row[col]+1;
90
- if (value2 < value1) {
91
- value1 = value2;
92
- }
93
-
94
- /* Deletion if it's cheaper than substitution. */
95
-
96
- value2 = curr_row[col-1]+1;
97
- if (value2 < value1) {
98
- value1 = value2;
99
- }
100
-
101
- /* Keep track of the minimum value on this row. */
102
-
103
- if (value1 < curr_row_min) {
104
- curr_row_min = value1;
105
- }
106
-
107
- curr_row[col] = value1;
108
- }
109
-
110
- /* Return nil as soon as we exceed the threshold. */
111
-
112
- if (threshold > -1 && curr_row_min >= threshold) {
113
- free(prev_row);
114
- free(curr_row);
115
-
116
- return Qnil;
117
- }
118
- }
119
-
120
- /* The result is the last value on the last row. */
121
-
122
- result = curr_row[l1];
123
-
124
- free(prev_row);
125
- free(curr_row);
126
-
127
- /* Return the Ruby version of the result. */
128
-
129
- return INT2FIX(result);
130
- }
@@ -1,133 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
-
13
- ID id_length = rb_intern("length");
14
- ID id_get = rb_intern("[]");
15
-
16
- /* Get the sizes of both sequences. */
17
-
18
- l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
19
- l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
20
-
21
- /* Convert Ruby's threshold to C's threshold. */
22
-
23
- if (!NIL_P(rb_threshold)) {
24
- threshold = FIX2INT(rb_threshold);
25
- } else {
26
- threshold = -1;
27
- }
28
-
29
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
30
-
31
- offset = 0;
32
-
33
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
34
- offset++;
35
- }
36
-
37
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
38
-
39
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
40
- l1--;
41
- l2--;
42
- }
43
-
44
- l1 -= offset;
45
- l2 -= offset;
46
-
47
- /* The Levenshtein algorithm itself. */
48
-
49
- /* s1= */
50
- /* ERIK */
51
- /* */
52
- /* 01234 */
53
- /* s2=V 11234 */
54
- /* E 21234 */
55
- /* E 32234 */
56
- /* N 43334 <- prev_row */
57
- /* S 54444 <- curr_row */
58
- /* T 65555 */
59
- /* R 76566 */
60
- /* A 87667 */
61
-
62
- /* Allocate memory for both rows */
63
-
64
- prev_row = (int*) ALLOC_N(int, (l1+1));
65
- curr_row = (int*) ALLOC_N(int, (l1+1));
66
-
67
- /* Initialize the current row. */
68
-
69
- for (col=0; col<=l1; col++) {
70
- curr_row[col] = col;
71
- }
72
-
73
- for (row=1; row<=l2; row++) {
74
- /* Copy the current row to the previous row. */
75
-
76
- temp_row = prev_row;
77
- prev_row = curr_row;
78
- curr_row = temp_row;
79
-
80
- /* Calculate the values of the current row. */
81
-
82
- curr_row[0] = row;
83
- curr_row_min = row;
84
-
85
- for (col=1; col<=l1; col++) {
86
- /* Equal (cost=0) or substitution (cost=1). */
87
-
88
- value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
-
90
- /* Insertion if it's cheaper than substitution. */
91
-
92
- value2 = prev_row[col]+1;
93
- if (value2 < value1) {
94
- value1 = value2;
95
- }
96
-
97
- /* Deletion if it's cheaper than substitution. */
98
-
99
- value2 = curr_row[col-1]+1;
100
- if (value2 < value1) {
101
- value1 = value2;
102
- }
103
-
104
- /* Keep track of the minimum value on this row. */
105
-
106
- if (value1 < curr_row_min) {
107
- curr_row_min = value1;
108
- }
109
-
110
- curr_row[col] = value1;
111
- }
112
-
113
- /* Return nil as soon as we exceed the threshold. */
114
-
115
- if (threshold > -1 && curr_row_min >= threshold) {
116
- free(prev_row);
117
- free(curr_row);
118
-
119
- return Qnil;
120
- }
121
- }
122
-
123
- /* The result is the last value on the last row. */
124
-
125
- result = curr_row[l1];
126
-
127
- free(prev_row);
128
- free(curr_row);
129
-
130
- /* Return the Ruby version of the result. */
131
-
132
- return INT2FIX(result);
133
- }
@@ -1,138 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
- char *s1, *s2;
13
-
14
- /* Convert Ruby's s1 to C's s1. */
15
-
16
- rb_o1 = StringValue(rb_o1);
17
- s1 = RSTRING_PTR(rb_o1);
18
- l1 = RSTRING_LEN(rb_o1);
19
-
20
- /* Convert Ruby's s2 to C's s2. */
21
-
22
- rb_o2 = StringValue(rb_o2);
23
- s2 = RSTRING_PTR(rb_o2);
24
- l2 = RSTRING_LEN(rb_o2);
25
-
26
- /* Convert Ruby's threshold to C's threshold. */
27
-
28
- if (!NIL_P(rb_threshold)) {
29
- threshold = FIX2INT(rb_threshold);
30
- } else {
31
- threshold = -1;
32
- }
33
-
34
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
35
-
36
- offset = 0;
37
-
38
- while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
39
- offset++;
40
- }
41
-
42
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
43
-
44
- while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
45
- l1--;
46
- l2--;
47
- }
48
-
49
- l1 -= offset;
50
- l2 -= offset;
51
-
52
- /* The Levenshtein algorithm itself. */
53
-
54
- /* s1= */
55
- /* ERIK */
56
- /* */
57
- /* 01234 */
58
- /* s2=V 11234 */
59
- /* E 21234 */
60
- /* E 32234 */
61
- /* N 43334 <- prev_row */
62
- /* S 54444 <- curr_row */
63
- /* T 65555 */
64
- /* R 76566 */
65
- /* A 87667 */
66
-
67
- /* Allocate memory for both rows */
68
-
69
- prev_row = (int*) ALLOC_N(int, (l1+1));
70
- curr_row = (int*) ALLOC_N(int, (l1+1));
71
-
72
- /* Initialize the current row. */
73
-
74
- for (col=0; col<=l1; col++) {
75
- curr_row[col] = col;
76
- }
77
-
78
- for (row=1; row<=l2; row++) {
79
- /* Copy the current row to the previous row. */
80
-
81
- temp_row = prev_row;
82
- prev_row = curr_row;
83
- curr_row = temp_row;
84
-
85
- /* Calculate the values of the current row. */
86
-
87
- curr_row[0] = row;
88
- curr_row_min = row;
89
-
90
- for (col=1; col<=l1; col++) {
91
- /* Equal (cost=0) or substitution (cost=1). */
92
-
93
- value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
94
-
95
- /* Insertion if it's cheaper than substitution. */
96
-
97
- value2 = prev_row[col]+1;
98
- if (value2 < value1) {
99
- value1 = value2;
100
- }
101
-
102
- /* Deletion if it's cheaper than substitution. */
103
-
104
- value2 = curr_row[col-1]+1;
105
- if (value2 < value1) {
106
- value1 = value2;
107
- }
108
-
109
- /* Keep track of the minimum value on this row. */
110
-
111
- if (value1 < curr_row_min) {
112
- curr_row_min = value1;
113
- }
114
-
115
- curr_row[col] = value1;
116
- }
117
-
118
- /* Return nil as soon as we exceed the threshold. */
119
-
120
- if (threshold > -1 && curr_row_min >= threshold) {
121
- free(prev_row);
122
- free(curr_row);
123
-
124
- return Qnil;
125
- }
126
- }
127
-
128
- /* The result is the last value on the last row. */
129
-
130
- result = curr_row[l1];
131
-
132
- free(prev_row);
133
- free(curr_row);
134
-
135
- /* Return the Ruby version of the result. */
136
-
137
- return INT2FIX(result);
138
- }
@@ -1,4 +0,0 @@
1
- module Levenshtein
2
- class LevenshteinException < RuntimeError
3
- end
4
- end