text 1.3.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text/levenshtein.rb +31 -20
- data/lib/text/version.rb +1 -1
- data/test/levenshtein_test.rb +17 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84969e89321dfeb30067b4b526928330964281a5
|
4
|
+
data.tar.gz: 79da28e5584e6ecd0d35c18cafb5bbd5a1e46989
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 49a658384d4f666da44a3bfea41c44632fa7a9b444ae132c079c87ac236be27ce34fb0f38c320c992aba51068c621c073f9fa7c184d883019921ea2710a6359d
|
7
|
+
data.tar.gz: cc04c2a8736339afd408514853df8fc0d8dd298be0adbbc2c6b47d0aaeceb31a534be8ca395efc34221933c7a4894cd5a5724a5da81b2cf8f14302ad178b9dd1
|
data/lib/text/levenshtein.rb
CHANGED
@@ -36,14 +36,15 @@ module Levenshtein
|
|
36
36
|
|
37
37
|
private
|
38
38
|
def distance_with_maximum(str1, str2, max_distance) # :nodoc:
|
39
|
-
s
|
40
|
-
|
39
|
+
s = str1.encode(Encoding::UTF_8).unpack("U*")
|
40
|
+
t = str2.encode(Encoding::UTF_8).unpack("U*")
|
41
|
+
|
41
42
|
n = s.length
|
42
43
|
m = t.length
|
43
44
|
big_int = n * m
|
44
|
-
|
45
|
-
|
46
|
-
|
45
|
+
|
46
|
+
# Swap if necessary so that s is always the shorter of the two strings
|
47
|
+
s, t, n, m = t, s, m, n if m < n
|
47
48
|
|
48
49
|
# If the length difference is already greater than the max_distance, then
|
49
50
|
# there is nothing else to check
|
@@ -51,6 +52,10 @@ private
|
|
51
52
|
return max_distance
|
52
53
|
end
|
53
54
|
|
55
|
+
return 0 if s == t
|
56
|
+
return m if n.zero?
|
57
|
+
return n if m.zero?
|
58
|
+
|
54
59
|
# The values necessary for our threshold are written; the ones after must
|
55
60
|
# be filled with large integers since the tailing member of the threshold
|
56
61
|
# window in the bottom array will run min across them
|
@@ -84,10 +89,12 @@ private
|
|
84
89
|
# computer science and computational biology.
|
85
90
|
# Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
|
86
91
|
# pp. 263–264.
|
87
|
-
min =
|
88
|
-
|
92
|
+
min = i - max_distance - 1
|
93
|
+
min = 0 if min < 0
|
94
|
+
max = i + max_distance
|
95
|
+
max = m - 1 if max > m - 1
|
89
96
|
|
90
|
-
(
|
97
|
+
min.upto(max) do |j|
|
91
98
|
# If the diagonal value is already greater than the max_distance
|
92
99
|
# then we can safely return: the diagonal will never go lower again.
|
93
100
|
# See: http://www.levenshtein.net/
|
@@ -96,11 +103,11 @@ private
|
|
96
103
|
end
|
97
104
|
|
98
105
|
cost = s[i] == t[j] ? 0 : 1
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
106
|
+
insertion = d[j + 1] + 1
|
107
|
+
deletion = e + 1
|
108
|
+
substitution = d[j] + cost
|
109
|
+
x = insertion < deletion ? insertion : deletion
|
110
|
+
x = substitution if substitution < x
|
104
111
|
|
105
112
|
d[j] = e
|
106
113
|
e = x
|
@@ -116,9 +123,12 @@ private
|
|
116
123
|
end
|
117
124
|
|
118
125
|
def distance_without_maximum(str1, str2) # :nodoc:
|
119
|
-
s
|
126
|
+
s = str1.encode(Encoding::UTF_8).unpack("U*")
|
127
|
+
t = str2.encode(Encoding::UTF_8).unpack("U*")
|
128
|
+
|
120
129
|
n = s.length
|
121
130
|
m = t.length
|
131
|
+
|
122
132
|
return m if n.zero?
|
123
133
|
return n if m.zero?
|
124
134
|
|
@@ -128,12 +138,13 @@ private
|
|
128
138
|
n.times do |i|
|
129
139
|
e = i + 1
|
130
140
|
m.times do |j|
|
131
|
-
cost =
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
141
|
+
cost = s[i] == t[j] ? 0 : 1
|
142
|
+
insertion = d[j + 1] + 1
|
143
|
+
deletion = e + 1
|
144
|
+
substitution = d[j] + cost
|
145
|
+
x = insertion < deletion ? insertion : deletion
|
146
|
+
x = substitution if substitution < x
|
147
|
+
|
137
148
|
d[j] = e
|
138
149
|
e = x
|
139
150
|
end
|
data/lib/text/version.rb
CHANGED
data/test/levenshtein_test.rb
CHANGED
@@ -48,6 +48,12 @@ class LevenshteinTest < Test::Unit::TestCase
|
|
48
48
|
assert_equal 3, distance("kitten", "sitting", 4)
|
49
49
|
end
|
50
50
|
|
51
|
+
def test_should_return_calculated_distance_when_less_than_maximum_for_empty_strings
|
52
|
+
assert_equal 3, distance("", "cat", 4)
|
53
|
+
assert_equal 3, distance("cat", "", 5)
|
54
|
+
assert_equal 0, distance("", "", 2)
|
55
|
+
end
|
56
|
+
|
51
57
|
def test_should_return_calculated_distance_when_same_as_maximum
|
52
58
|
assert_equal 0, distance("test", "test", 0)
|
53
59
|
assert_equal 1, distance("test", "tent", 1)
|
@@ -55,12 +61,23 @@ class LevenshteinTest < Test::Unit::TestCase
|
|
55
61
|
assert_equal 3, distance("kitten", "sitting", 3)
|
56
62
|
end
|
57
63
|
|
64
|
+
def test_should_return_calculated_distance_when_same_as_maximum_for_empty_strings
|
65
|
+
assert_equal 3, distance("", "cat", 3)
|
66
|
+
assert_equal 3, distance("cat", "", 3)
|
67
|
+
assert_equal 0, distance("", "", 0)
|
68
|
+
end
|
69
|
+
|
58
70
|
def test_should_return_specified_maximum_if_distance_is_more
|
59
71
|
assert_equal 1, distance("gumbo", "gambol", 1)
|
60
72
|
assert_equal 2, distance("kitten", "sitting", 2)
|
61
73
|
assert_equal 1, distance("test", "tasf", 1)
|
62
74
|
end
|
63
75
|
|
76
|
+
def test_should_return_specified_maximum_if_distance_is_more_for_empty_strings
|
77
|
+
assert_equal 2, distance("kitten", "", 2)
|
78
|
+
assert_equal 3, distance("", "kitten", 3)
|
79
|
+
end
|
80
|
+
|
64
81
|
def test_should_return_maximum_distance_for_strings_with_additions_at_start
|
65
82
|
assert_equal 1, distance("1234", "01234")
|
66
83
|
assert_equal 0, distance("1234", "01234", 0)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Battley
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2015-04-13 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rake
|
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
80
80
|
version: '0'
|
81
81
|
requirements: []
|
82
82
|
rubyforge_project: text
|
83
|
-
rubygems_version: 2.
|
83
|
+
rubygems_version: 2.4.5
|
84
84
|
signing_key:
|
85
85
|
specification_version: 4
|
86
86
|
summary: A collection of text algorithms
|