levenshtein 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,4 +1,8 @@
1
- 0.2.1 (11-02-2012)
1
+ 0.2.2 (16-03-2012)
2
+
3
+ * Simplified code.
4
+
5
+ 0.2.1 (11-03-2012)
2
6
 
3
7
  * Better memory handling.
4
8
 
data/README CHANGED
@@ -1,12 +1,15 @@
1
- The Levenshtein distance is a metric for measuring the amount of difference
2
- between two sequences (i.e., the so called edit distance). The Levenshtein
3
- distance between two sequences is given by the minimum number of operations
4
- needed to transform one sequence into the other, where an operation is an
1
+ The Levenshtein distance is a metric for measuring the amount
2
+ of difference between two sequences (i.e., the so called edit
3
+ distance). The Levenshtein distance between two sequences is
4
+ given by the minimum number of operations needed to transform
5
+ one sequence into the other, where an operation is an
5
6
  insertion, deletion, or substitution of a single element.
6
7
 
7
- The two sequences can be two strings, two arrays, or two other objects.
8
- Strings, arrays and arrays of strings are handled with optimized (very fast) C
9
- code. All other sequences are handled with generic (fast) C code.
8
+ The two sequences can be two strings, two arrays, or two other
9
+ objects responding to :each. All sequences are handled by generic
10
+ (fast) C code.
11
+
12
+ All objects in the sequences should respond to :hash and :eql?.
10
13
 
11
14
  More information about the Levenshtein distance algorithm:
12
15
  http://en.wikipedia.org/wiki/Levenshtein_distance .
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.2.2
@@ -2,17 +2,121 @@
2
2
  #include "levenshtein.h"
3
3
 
4
4
  VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
6
- return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
7
- } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
8
- if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
9
- return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
10
- } else {
11
- return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
12
- }
5
+ VALUE *p1, *p2;
6
+ long l1, l2;
7
+ long col, row;
8
+ int threshold;
9
+ int *prev_row, *curr_row, *temp_row;
10
+ int curr_row_min, result;
11
+ int value1, value2;
12
+
13
+ /* Be sure that all equivalent objects in rb_o1 and rb_o2 (a.eql?(b) == true) are taken from a pool (a.equal?(b) == true). */
14
+ /* This is done in levenshtein.rb by means of Util.pool. */
15
+
16
+ /* Get the sizes of both arrays. */
17
+
18
+ l1 = RARRAY_LEN(rb_o1);
19
+ l2 = RARRAY_LEN(rb_o2);
20
+
21
+ /* Get the pointers of both arrays. */
22
+
23
+ p1 = RARRAY_PTR(rb_o1);
24
+ p2 = RARRAY_PTR(rb_o2);
25
+
26
+ /* Convert Ruby's threshold to C's threshold. */
27
+
28
+ if (!NIL_P(rb_threshold)) {
29
+ threshold = FIX2INT(rb_threshold);
13
30
  } else {
14
- return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
31
+ threshold = -1;
32
+ }
33
+
34
+ /* The Levenshtein algorithm itself. */
35
+
36
+ /* s1= */
37
+ /* ERIK */
38
+ /* */
39
+ /* 01234 */
40
+ /* s2=V 11234 */
41
+ /* E 21234 */
42
+ /* E 32234 */
43
+ /* N 43334 <- prev_row */
44
+ /* S 54444 <- curr_row */
45
+ /* T 65555 */
46
+ /* R 76566 */
47
+ /* A 87667 */
48
+
49
+ /* Allocate memory for both rows */
50
+
51
+ prev_row = (int*) ALLOC_N(int, (l1+1));
52
+ curr_row = (int*) ALLOC_N(int, (l1+1));
53
+
54
+ /* Initialize the current row. */
55
+
56
+ for (col=0; col<=l1; col++) {
57
+ curr_row[col] = col;
15
58
  }
59
+
60
+ for (row=1; row<=l2; row++) {
61
+ /* Copy the current row to the previous row. */
62
+
63
+ temp_row = prev_row;
64
+ prev_row = curr_row;
65
+ curr_row = temp_row;
66
+
67
+ /* Calculate the values of the current row. */
68
+
69
+ curr_row[0] = row;
70
+ curr_row_min = row;
71
+
72
+ for (col=1; col<=l1; col++) {
73
+ /* Equal (cost=0) or substitution (cost=1). */
74
+
75
+ value1 = prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1);
76
+
77
+ /* Insertion if it's cheaper than substitution. */
78
+
79
+ value2 = prev_row[col]+1;
80
+ if (value2 < value1) {
81
+ value1 = value2;
82
+ }
83
+
84
+ /* Deletion if it's cheaper than substitution. */
85
+
86
+ value2 = curr_row[col-1]+1;
87
+ if (value2 < value1) {
88
+ value1 = value2;
89
+ }
90
+
91
+ /* Keep track of the minimum value on this row. */
92
+
93
+ if (value1 < curr_row_min) {
94
+ curr_row_min = value1;
95
+ }
96
+
97
+ curr_row[col] = value1;
98
+ }
99
+
100
+ /* Return nil as soon as we exceed the threshold. */
101
+
102
+ if (threshold > -1 && curr_row_min >= threshold) {
103
+ free(prev_row);
104
+ free(curr_row);
105
+
106
+ return Qnil;
107
+ }
108
+ }
109
+
110
+ /* The result is the last value on the last row. */
111
+
112
+ result = curr_row[l1];
113
+
114
+ free(prev_row);
115
+ free(curr_row);
116
+
117
+ /* Return the Ruby version of the result. */
118
+
119
+ return INT2FIX(result);
16
120
  }
17
121
 
18
122
  void Init_levenshtein_fast() {
@@ -1,25 +1,30 @@
1
- require "levenshtein/exception"
1
+ # encoding: UTF-8
2
+
2
3
  require "levenshtein/version"
3
4
 
4
5
  module Levenshtein
5
6
  # Returns the Levenshtein distance as a number between 0.0 and
6
7
  # 1.0. It's basically the Levenshtein distance divided by the
7
- # length of the longest sequence.
8
+ # size of the longest sequence.
8
9
 
9
- def self.normalized_distance(a1, a2, threshold=nil)
10
- a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
10
+ def self.normalized_distance(a1, a2, threshold=nil, options={})
11
+ size = [a1.size, a2.size].max
11
12
 
12
- if a2.length == 0
13
- 0.0 # Since a1.length < a2.length, a1 must be empty as well.
13
+ if a1.size == 0 and a2.size == 0
14
+ 0.0
15
+ elsif a1.size == 0
16
+ a2.size.to_f/size
17
+ elsif a2.size == 0
18
+ a1.size.to_f/size
14
19
  else
15
20
  if threshold
16
- if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
17
- d.to_f/a2.length
21
+ if d = self.distance(a1, a2, (threshold*size).to_i+1)
22
+ d.to_f/size
18
23
  else
19
24
  nil
20
25
  end
21
26
  else
22
- self.distance(a1, a2).to_f/a2.length
27
+ self.distance(a1, a2).to_f/size
23
28
  end
24
29
  end
25
30
  end
@@ -27,71 +32,79 @@ module Levenshtein
27
32
  # Returns the Levenshtein distance between two sequences.
28
33
  #
29
34
  # The two sequences can be two strings, two arrays, or two other
30
- # objects. Strings, arrays and arrays of strings are handled with
31
- # optimized (very fast) C code. All other sequences are handled
32
- # with generic (fast) C code.
35
+ # objects responding to :each. All sequences are handled by generic
36
+ # (fast) C code.
33
37
  #
34
- # The sequences should respond to :length and :[] and all objects
35
- # in the sequences (as returned by []) should response to :==.
38
+ # All objects in the sequences should respond to :hash and :eql?.
36
39
 
37
- def self.distance(a1, a2, threshold=nil)
38
- a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
40
+ def self.distance(a1, a2, threshold=nil, options={})
41
+ a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2
42
+ a1, a2 = Util.pool(a1, a2)
39
43
 
40
44
  # Handle some basic circumstances.
41
45
 
42
46
  return 0 if a1 == a2
43
- return a2.length if a1.length == 0
47
+ return a2.size if a1.empty?
48
+ return a1.size if a2.empty?
44
49
 
45
50
  if threshold
46
- return nil if (a2.length-a1.length) >= threshold
47
-
48
- a3, a4 = nil, nil
49
- a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
50
- a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
51
-
52
- if a3 and a4
53
- return nil if (a3-a4).length >= threshold
54
- return nil if (a4-a3).length >= threshold
55
- end
51
+ return nil if (a1.size-a2.size) >= threshold
52
+ return nil if (a2.size-a1.size) >= threshold
53
+ return nil if (a1-a2).size >= threshold
54
+ return nil if (a2-a1).size >= threshold
56
55
  end
57
56
 
58
- distance_fast_or_slow(a1, a2, threshold)
59
- end
60
-
61
- def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
62
- if respond_to?(:distance_fast)
63
- distance_fast(a1, a2, threshold) # Implemented in C.
64
- else
65
- distance_slow(a1, a2, threshold) # Implemented in Ruby.
66
- end
67
- end
57
+ # Remove the common prefix and the common postfix.
68
58
 
69
- def self.distance_slow(a1, a2, threshold) # :nodoc:
70
- l1 = a1.length
71
- l2 = a2.length
59
+ l1 = a1.size
60
+ l2 = a2.size
72
61
 
73
- offset = 0
62
+ offset = 0
63
+ no_more_optimizations = true
74
64
 
75
- while offset < l1 and offset < l2 and a1[offset] == a2[offset]
65
+ while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
76
66
  offset += 1
67
+
68
+ no_more_optimizations = false
77
69
  end
78
70
 
79
- while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
71
+ while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
80
72
  l1 -= 1
81
73
  l2 -= 1
74
+
75
+ no_more_optimizations = false
82
76
  end
83
77
 
84
- l1 -= offset
85
- l2 -= offset
78
+ if no_more_optimizations
79
+ distance_fast_or_slow(a1, a2, threshold, options)
80
+ else
81
+ l1 -= offset
82
+ l2 -= offset
83
+
84
+ a1 = a1[offset, l1]
85
+ a2 = a2[offset, l2]
86
86
 
87
- crow = (0..l1).to_a
87
+ distance(a1, a2, threshold, options)
88
+ end
89
+ end
88
90
 
89
- 1.upto(l2) do |y|
91
+ def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc:
92
+ if respond_to?(:distance_fast) and options[:force_slow]
93
+ distance_fast(a1, a2, threshold) # Implemented in C.
94
+ else
95
+ distance_slow(a1, a2, threshold) # Implemented in Ruby.
96
+ end
97
+ end
98
+
99
+ def self.distance_slow(a1, a2, threshold) # :nodoc:
100
+ crow = (0..a1.size).to_a
101
+
102
+ 1.upto(a2.size) do |y|
90
103
  prow = crow
91
104
  crow = [y]
92
105
 
93
- 1.upto(l1) do |x|
94
- crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
106
+ 1.upto(a1.size) do |x|
107
+ crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
95
108
  end
96
109
 
97
110
  # Stop analysing this sequence as soon as the best possible
@@ -104,6 +117,24 @@ module Levenshtein
104
117
 
105
118
  crow[-1]
106
119
  end
120
+
121
+ module Util # :nodoc:
122
+ def self.pool(*args)
123
+ # So we can compare pointers instead of objects (equal?() instead of ==()).
124
+
125
+ pool = {}
126
+
127
+ args.collect do |arg|
128
+ a = []
129
+
130
+ arg.each do |o|
131
+ a << pool[o] ||= o
132
+ end
133
+
134
+ a
135
+ end
136
+ end
137
+ end
107
138
  end
108
139
 
109
140
  begin
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  module Levenshtein
2
- VERSION = "0.2.1"
4
+ VERSION = "0.2.2"
3
5
  end
@@ -1,3 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
1
4
  require "test/unit"
2
5
  require "levenshtein"
3
6
 
@@ -7,14 +10,10 @@ module Levenshtein
7
10
  @sequence = o
8
11
  end
9
12
 
10
- def length
11
- @sequence.length
12
- end
13
-
14
- def [](pos)
15
- raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
16
-
17
- @sequence[pos]
13
+ def each
14
+ @sequence.length.times do |pos|
15
+ yield(@sequence[pos])
16
+ end
18
17
  end
19
18
  end
20
19
 
@@ -24,14 +23,18 @@ module Levenshtein
24
23
  def initialize(o)
25
24
  @object = o
26
25
  end
27
-
28
- def ==(other)
29
- @object == other.object
26
+
27
+ def hash
28
+ @object.hash
29
+ end
30
+
31
+ def eql?(other)
32
+ @object.eql?(other.object)
30
33
  end
31
34
  end
32
35
  end
33
36
 
34
- class TestLevenshteinString < Test::Unit::TestCase
37
+ class TestLevenshtein < Test::Unit::TestCase
35
38
  def test_erik_veenstra
36
39
  assert_equal(7, Levenshtein.distance("erik", "veenstra"))
37
40
  assert_equal(7, Levenshtein.distance("veenstra", "erik"))
@@ -79,59 +82,73 @@ class TestLevenshteinString < Test::Unit::TestCase
79
82
  assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
80
83
  assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
81
84
  end
82
- end
83
85
 
84
- class TestLevenshteinArray < Test::Unit::TestCase
85
- def test_erik_veenstra
86
- x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
86
+ def test_interface
87
+ seq1 = Levenshtein::TestSequence.new("erik".scan(/./).collect{|e| Levenshtein::TestElement.new(e)})
88
+ seq2 = Levenshtein::TestSequence.new("veenstra".scan(/./).collect{|e| Levenshtein::TestElement.new(e)})
87
89
 
88
- assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
90
+ assert_equal(7, Levenshtein.distance(seq1, seq2))
89
91
  end
90
92
  end
91
93
 
92
- class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
94
+ class TestLevenshteinFast < Test::Unit::TestCase
93
95
  def test_erik_veenstra
94
- x = lambda{|s| s.scan(/./)}
96
+ assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>false))
97
+ assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>false))
98
+ end
95
99
 
96
- assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
100
+ def test_empty_string
101
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false))
102
+ assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>false))
103
+ assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>false))
97
104
  end
98
- end
99
105
 
100
- class TestLevenshteinGeneric < Test::Unit::TestCase
101
- def test_erik_veenstra
102
- x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
106
+ def test_same_string
107
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false))
108
+ assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>false))
109
+ end
103
110
 
104
- assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
111
+ def test_threshold
112
+ assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>false))
113
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>false))
114
+ assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>false))
115
+ end
116
+
117
+ def test_same_head_and_or_tail
118
+ assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>false))
119
+ assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>false))
120
+ assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>false))
121
+ assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>false))
105
122
  end
106
123
  end
107
124
 
108
125
  class TestLevenshteinSlow < Test::Unit::TestCase
109
126
  def test_erik_veenstra
110
- assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
111
- assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
127
+ assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>true))
128
+ assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>true))
112
129
  end
113
130
 
114
131
  def test_empty_string
115
- assert_equal(0, Levenshtein.distance_slow("", "", nil))
116
- assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
117
- assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
132
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true))
133
+ assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>true))
134
+ assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>true))
118
135
  end
119
136
 
120
137
  def test_same_string
121
- assert_equal(0, Levenshtein.distance_slow("", "", nil))
122
- assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
138
+ assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true))
139
+ assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>true))
123
140
  end
124
141
 
125
142
  def test_threshold
126
- assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
127
- assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
128
- assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
143
+ assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>true))
144
+ assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>true))
145
+ assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>true))
129
146
  end
130
147
 
131
148
  def test_same_head_and_or_tail
132
- assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
133
- assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
134
- assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
135
- assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
149
+ assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>true))
150
+ assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>true))
151
+ assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>true))
152
+ assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>true))
136
153
  end
137
154
  end
metadata CHANGED
@@ -1,41 +1,27 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: levenshtein
3
- version: !ruby/object:Gem::Version
4
- hash: 21
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 2
9
- - 1
10
- version: 0.2.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Erik Veenstra
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-02-11 00:00:00 Z
12
+ date: 2012-03-16 00:00:00.000000000 Z
19
13
  dependencies: []
20
-
21
14
  description: Calculates the Levenshtein distance between two byte strings.
22
15
  email: levenshtein@erikveen.dds.nl
23
16
  executables: []
24
-
25
- extensions:
17
+ extensions:
26
18
  - ext/levenshtein/extconf.rb
27
19
  extra_rdoc_files: []
28
-
29
- files:
30
- - lib/levenshtein/exception.rb
20
+ files:
31
21
  - lib/levenshtein/version.rb
32
22
  - lib/levenshtein.rb
33
- - ext/levenshtein/levenshtein_string.c
34
- - ext/levenshtein/levenshtein_generic.c
35
23
  - ext/levenshtein/levenshtein.h
36
24
  - ext/levenshtein/levenshtein_fast.c
37
- - ext/levenshtein/levenshtein_array_of_strings.c
38
- - ext/levenshtein/levenshtein_array.c
39
25
  - ext/levenshtein/extconf.rb
40
26
  - README
41
27
  - LICENSE
@@ -44,43 +30,35 @@ files:
44
30
  - test/test.rb
45
31
  homepage: http://www.erikveen.dds.nl/levenshtein/index.html
46
32
  licenses: []
47
-
48
33
  post_install_message:
49
- rdoc_options:
34
+ rdoc_options:
50
35
  - README
51
36
  - LICENSE
52
37
  - VERSION
53
38
  - CHANGELOG
54
39
  - --title
55
- - levenshtein (0.2.1)
40
+ - levenshtein (0.2.2)
56
41
  - --main
57
42
  - README
58
- require_paths:
43
+ require_paths:
59
44
  - lib
60
- required_ruby_version: !ruby/object:Gem::Requirement
45
+ required_ruby_version: !ruby/object:Gem::Requirement
61
46
  none: false
62
- requirements:
63
- - - ">="
64
- - !ruby/object:Gem::Version
65
- hash: 3
66
- segments:
67
- - 0
68
- version: "0"
69
- required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
52
  none: false
71
- requirements:
72
- - - ">="
73
- - !ruby/object:Gem::Version
74
- hash: 3
75
- segments:
76
- - 0
77
- version: "0"
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
78
57
  requirements: []
79
-
80
58
  rubyforge_project: levenshtein
81
- rubygems_version: 1.8.12
59
+ rubygems_version: 1.8.18
82
60
  signing_key:
83
61
  specification_version: 3
84
62
  summary: Calculates the Levenshtein distance between two byte strings.
85
- test_files:
63
+ test_files:
86
64
  - test/test.rb
@@ -1,130 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
-
13
- /* Get the sizes of both arrays. */
14
-
15
- l1 = RARRAY_LEN(rb_o1);
16
- l2 = RARRAY_LEN(rb_o2);
17
-
18
- /* Convert Ruby's threshold to C's threshold. */
19
-
20
- if (!NIL_P(rb_threshold)) {
21
- threshold = FIX2INT(rb_threshold);
22
- } else {
23
- threshold = -1;
24
- }
25
-
26
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
-
28
- offset = 0;
29
-
30
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
31
- offset++;
32
- }
33
-
34
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
35
-
36
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
37
- l1--;
38
- l2--;
39
- }
40
-
41
- l1 -= offset;
42
- l2 -= offset;
43
-
44
- /* The Levenshtein algorithm itself. */
45
-
46
- /* s1= */
47
- /* ERIK */
48
- /* */
49
- /* 01234 */
50
- /* s2=V 11234 */
51
- /* E 21234 */
52
- /* E 32234 */
53
- /* N 43334 <- prev_row */
54
- /* S 54444 <- curr_row */
55
- /* T 65555 */
56
- /* R 76566 */
57
- /* A 87667 */
58
-
59
- /* Allocate memory for both rows */
60
-
61
- prev_row = (int*) ALLOC_N(int, (l1+1));
62
- curr_row = (int*) ALLOC_N(int, (l1+1));
63
-
64
- /* Initialize the current row. */
65
-
66
- for (col=0; col<=l1; col++) {
67
- curr_row[col] = col;
68
- }
69
-
70
- for (row=1; row<=l2; row++) {
71
- /* Copy the current row to the previous row. */
72
-
73
- temp_row = prev_row;
74
- prev_row = curr_row;
75
- curr_row = temp_row;
76
-
77
- /* Calculate the values of the current row. */
78
-
79
- curr_row[0] = row;
80
- curr_row_min = row;
81
-
82
- for (col=1; col<=l1; col++) {
83
- /* Equal (cost=0) or substitution (cost=1). */
84
-
85
- value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
86
-
87
- /* Insertion if it's cheaper than substitution. */
88
-
89
- value2 = prev_row[col]+1;
90
- if (value2 < value1) {
91
- value1 = value2;
92
- }
93
-
94
- /* Deletion if it's cheaper than substitution. */
95
-
96
- value2 = curr_row[col-1]+1;
97
- if (value2 < value1) {
98
- value1 = value2;
99
- }
100
-
101
- /* Keep track of the minimum value on this row. */
102
-
103
- if (value1 < curr_row_min) {
104
- curr_row_min = value1;
105
- }
106
-
107
- curr_row[col] = value1;
108
- }
109
-
110
- /* Return nil as soon as we exceed the threshold. */
111
-
112
- if (threshold > -1 && curr_row_min >= threshold) {
113
- free(prev_row);
114
- free(curr_row);
115
-
116
- return Qnil;
117
- }
118
- }
119
-
120
- /* The result is the last value on the last row. */
121
-
122
- result = curr_row[l1];
123
-
124
- free(prev_row);
125
- free(curr_row);
126
-
127
- /* Return the Ruby version of the result. */
128
-
129
- return INT2FIX(result);
130
- }
@@ -1,130 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
-
13
- /* Get the sizes of both arrays. */
14
-
15
- l1 = RARRAY_LEN(rb_o1);
16
- l2 = RARRAY_LEN(rb_o2);
17
-
18
- /* Convert Ruby's threshold to C's threshold. */
19
-
20
- if (!NIL_P(rb_threshold)) {
21
- threshold = FIX2INT(rb_threshold);
22
- } else {
23
- threshold = -1;
24
- }
25
-
26
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
27
-
28
- offset = 0;
29
-
30
- while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
31
- offset++;
32
- }
33
-
34
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
35
-
36
- while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
37
- l1--;
38
- l2--;
39
- }
40
-
41
- l1 -= offset;
42
- l2 -= offset;
43
-
44
- /* The Levenshtein algorithm itself. */
45
-
46
- /* s1= */
47
- /* ERIK */
48
- /* */
49
- /* 01234 */
50
- /* s2=V 11234 */
51
- /* E 21234 */
52
- /* E 32234 */
53
- /* N 43334 <- prev_row */
54
- /* S 54444 <- curr_row */
55
- /* T 65555 */
56
- /* R 76566 */
57
- /* A 87667 */
58
-
59
- /* Allocate memory for both rows */
60
-
61
- prev_row = (int*) ALLOC_N(int, (l1+1));
62
- curr_row = (int*) ALLOC_N(int, (l1+1));
63
-
64
- /* Initialize the current row. */
65
-
66
- for (col=0; col<=l1; col++) {
67
- curr_row[col] = col;
68
- }
69
-
70
- for (row=1; row<=l2; row++) {
71
- /* Copy the current row to the previous row. */
72
-
73
- temp_row = prev_row;
74
- prev_row = curr_row;
75
- curr_row = temp_row;
76
-
77
- /* Calculate the values of the current row. */
78
-
79
- curr_row[0] = row;
80
- curr_row_min = row;
81
-
82
- for (col=1; col<=l1; col++) {
83
- /* Equal (cost=0) or substitution (cost=1). */
84
-
85
- value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
86
-
87
- /* Insertion if it's cheaper than substitution. */
88
-
89
- value2 = prev_row[col]+1;
90
- if (value2 < value1) {
91
- value1 = value2;
92
- }
93
-
94
- /* Deletion if it's cheaper than substitution. */
95
-
96
- value2 = curr_row[col-1]+1;
97
- if (value2 < value1) {
98
- value1 = value2;
99
- }
100
-
101
- /* Keep track of the minimum value on this row. */
102
-
103
- if (value1 < curr_row_min) {
104
- curr_row_min = value1;
105
- }
106
-
107
- curr_row[col] = value1;
108
- }
109
-
110
- /* Return nil as soon as we exceed the threshold. */
111
-
112
- if (threshold > -1 && curr_row_min >= threshold) {
113
- free(prev_row);
114
- free(curr_row);
115
-
116
- return Qnil;
117
- }
118
- }
119
-
120
- /* The result is the last value on the last row. */
121
-
122
- result = curr_row[l1];
123
-
124
- free(prev_row);
125
- free(curr_row);
126
-
127
- /* Return the Ruby version of the result. */
128
-
129
- return INT2FIX(result);
130
- }
@@ -1,133 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
-
13
- ID id_length = rb_intern("length");
14
- ID id_get = rb_intern("[]");
15
-
16
- /* Get the sizes of both sequences. */
17
-
18
- l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
19
- l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
20
-
21
- /* Convert Ruby's threshold to C's threshold. */
22
-
23
- if (!NIL_P(rb_threshold)) {
24
- threshold = FIX2INT(rb_threshold);
25
- } else {
26
- threshold = -1;
27
- }
28
-
29
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
30
-
31
- offset = 0;
32
-
33
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
34
- offset++;
35
- }
36
-
37
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
38
-
39
- while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
40
- l1--;
41
- l2--;
42
- }
43
-
44
- l1 -= offset;
45
- l2 -= offset;
46
-
47
- /* The Levenshtein algorithm itself. */
48
-
49
- /* s1= */
50
- /* ERIK */
51
- /* */
52
- /* 01234 */
53
- /* s2=V 11234 */
54
- /* E 21234 */
55
- /* E 32234 */
56
- /* N 43334 <- prev_row */
57
- /* S 54444 <- curr_row */
58
- /* T 65555 */
59
- /* R 76566 */
60
- /* A 87667 */
61
-
62
- /* Allocate memory for both rows */
63
-
64
- prev_row = (int*) ALLOC_N(int, (l1+1));
65
- curr_row = (int*) ALLOC_N(int, (l1+1));
66
-
67
- /* Initialize the current row. */
68
-
69
- for (col=0; col<=l1; col++) {
70
- curr_row[col] = col;
71
- }
72
-
73
- for (row=1; row<=l2; row++) {
74
- /* Copy the current row to the previous row. */
75
-
76
- temp_row = prev_row;
77
- prev_row = curr_row;
78
- curr_row = temp_row;
79
-
80
- /* Calculate the values of the current row. */
81
-
82
- curr_row[0] = row;
83
- curr_row_min = row;
84
-
85
- for (col=1; col<=l1; col++) {
86
- /* Equal (cost=0) or substitution (cost=1). */
87
-
88
- value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
89
-
90
- /* Insertion if it's cheaper than substitution. */
91
-
92
- value2 = prev_row[col]+1;
93
- if (value2 < value1) {
94
- value1 = value2;
95
- }
96
-
97
- /* Deletion if it's cheaper than substitution. */
98
-
99
- value2 = curr_row[col-1]+1;
100
- if (value2 < value1) {
101
- value1 = value2;
102
- }
103
-
104
- /* Keep track of the minimum value on this row. */
105
-
106
- if (value1 < curr_row_min) {
107
- curr_row_min = value1;
108
- }
109
-
110
- curr_row[col] = value1;
111
- }
112
-
113
- /* Return nil as soon as we exceed the threshold. */
114
-
115
- if (threshold > -1 && curr_row_min >= threshold) {
116
- free(prev_row);
117
- free(curr_row);
118
-
119
- return Qnil;
120
- }
121
- }
122
-
123
- /* The result is the last value on the last row. */
124
-
125
- result = curr_row[l1];
126
-
127
- free(prev_row);
128
- free(curr_row);
129
-
130
- /* Return the Ruby version of the result. */
131
-
132
- return INT2FIX(result);
133
- }
@@ -1,138 +0,0 @@
1
- #include "ruby.h"
2
- #include "levenshtein.h"
3
-
4
- VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
5
- int threshold;
6
- int l1, l2;
7
- int *prev_row, *curr_row, *temp_row;
8
- int col, row;
9
- int curr_row_min, result;
10
- int offset;
11
- int value1, value2;
12
- char *s1, *s2;
13
-
14
- /* Convert Ruby's s1 to C's s1. */
15
-
16
- rb_o1 = StringValue(rb_o1);
17
- s1 = RSTRING_PTR(rb_o1);
18
- l1 = RSTRING_LEN(rb_o1);
19
-
20
- /* Convert Ruby's s2 to C's s2. */
21
-
22
- rb_o2 = StringValue(rb_o2);
23
- s2 = RSTRING_PTR(rb_o2);
24
- l2 = RSTRING_LEN(rb_o2);
25
-
26
- /* Convert Ruby's threshold to C's threshold. */
27
-
28
- if (!NIL_P(rb_threshold)) {
29
- threshold = FIX2INT(rb_threshold);
30
- } else {
31
- threshold = -1;
32
- }
33
-
34
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
35
-
36
- offset = 0;
37
-
38
- while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
39
- offset++;
40
- }
41
-
42
- /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
43
-
44
- while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
45
- l1--;
46
- l2--;
47
- }
48
-
49
- l1 -= offset;
50
- l2 -= offset;
51
-
52
- /* The Levenshtein algorithm itself. */
53
-
54
- /* s1= */
55
- /* ERIK */
56
- /* */
57
- /* 01234 */
58
- /* s2=V 11234 */
59
- /* E 21234 */
60
- /* E 32234 */
61
- /* N 43334 <- prev_row */
62
- /* S 54444 <- curr_row */
63
- /* T 65555 */
64
- /* R 76566 */
65
- /* A 87667 */
66
-
67
- /* Allocate memory for both rows */
68
-
69
- prev_row = (int*) ALLOC_N(int, (l1+1));
70
- curr_row = (int*) ALLOC_N(int, (l1+1));
71
-
72
- /* Initialize the current row. */
73
-
74
- for (col=0; col<=l1; col++) {
75
- curr_row[col] = col;
76
- }
77
-
78
- for (row=1; row<=l2; row++) {
79
- /* Copy the current row to the previous row. */
80
-
81
- temp_row = prev_row;
82
- prev_row = curr_row;
83
- curr_row = temp_row;
84
-
85
- /* Calculate the values of the current row. */
86
-
87
- curr_row[0] = row;
88
- curr_row_min = row;
89
-
90
- for (col=1; col<=l1; col++) {
91
- /* Equal (cost=0) or substitution (cost=1). */
92
-
93
- value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
94
-
95
- /* Insertion if it's cheaper than substitution. */
96
-
97
- value2 = prev_row[col]+1;
98
- if (value2 < value1) {
99
- value1 = value2;
100
- }
101
-
102
- /* Deletion if it's cheaper than substitution. */
103
-
104
- value2 = curr_row[col-1]+1;
105
- if (value2 < value1) {
106
- value1 = value2;
107
- }
108
-
109
- /* Keep track of the minimum value on this row. */
110
-
111
- if (value1 < curr_row_min) {
112
- curr_row_min = value1;
113
- }
114
-
115
- curr_row[col] = value1;
116
- }
117
-
118
- /* Return nil as soon as we exceed the threshold. */
119
-
120
- if (threshold > -1 && curr_row_min >= threshold) {
121
- free(prev_row);
122
- free(curr_row);
123
-
124
- return Qnil;
125
- }
126
- }
127
-
128
- /* The result is the last value on the last row. */
129
-
130
- result = curr_row[l1];
131
-
132
- free(prev_row);
133
- free(curr_row);
134
-
135
- /* Return the Ruby version of the result. */
136
-
137
- return INT2FIX(result);
138
- }
@@ -1,4 +0,0 @@
1
- module Levenshtein
2
- class LevenshteinException < RuntimeError
3
- end
4
- end