text 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text/levenshtein.rb +31 -20
- data/lib/text/version.rb +1 -1
- data/test/levenshtein_test.rb +17 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84969e89321dfeb30067b4b526928330964281a5
|
4
|
+
data.tar.gz: 79da28e5584e6ecd0d35c18cafb5bbd5a1e46989
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 49a658384d4f666da44a3bfea41c44632fa7a9b444ae132c079c87ac236be27ce34fb0f38c320c992aba51068c621c073f9fa7c184d883019921ea2710a6359d
|
7
|
+
data.tar.gz: cc04c2a8736339afd408514853df8fc0d8dd298be0adbbc2c6b47d0aaeceb31a534be8ca395efc34221933c7a4894cd5a5724a5da81b2cf8f14302ad178b9dd1
|
data/lib/text/levenshtein.rb
CHANGED
@@ -36,14 +36,15 @@ module Levenshtein
|
|
36
36
|
|
37
37
|
private
|
38
38
|
def distance_with_maximum(str1, str2, max_distance) # :nodoc:
|
39
|
-
s
|
40
|
-
|
39
|
+
s = str1.encode(Encoding::UTF_8).unpack("U*")
|
40
|
+
t = str2.encode(Encoding::UTF_8).unpack("U*")
|
41
|
+
|
41
42
|
n = s.length
|
42
43
|
m = t.length
|
43
44
|
big_int = n * m
|
44
|
-
|
45
|
-
|
46
|
-
|
45
|
+
|
46
|
+
# Swap if necessary so that s is always the shorter of the two strings
|
47
|
+
s, t, n, m = t, s, m, n if m < n
|
47
48
|
|
48
49
|
# If the length difference is already greater than the max_distance, then
|
49
50
|
# there is nothing else to check
|
@@ -51,6 +52,10 @@ private
|
|
51
52
|
return max_distance
|
52
53
|
end
|
53
54
|
|
55
|
+
return 0 if s == t
|
56
|
+
return m if n.zero?
|
57
|
+
return n if m.zero?
|
58
|
+
|
54
59
|
# The values necessary for our threshold are written; the ones after must
|
55
60
|
# be filled with large integers since the tailing member of the threshold
|
56
61
|
# window in the bottom array will run min across them
|
@@ -84,10 +89,12 @@ private
|
|
84
89
|
# computer science and computational biology.
|
85
90
|
# Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
|
86
91
|
# pp. 263–264.
|
87
|
-
min =
|
88
|
-
|
92
|
+
min = i - max_distance - 1
|
93
|
+
min = 0 if min < 0
|
94
|
+
max = i + max_distance
|
95
|
+
max = m - 1 if max > m - 1
|
89
96
|
|
90
|
-
(
|
97
|
+
min.upto(max) do |j|
|
91
98
|
# If the diagonal value is already greater than the max_distance
|
92
99
|
# then we can safely return: the diagonal will never go lower again.
|
93
100
|
# See: http://www.levenshtein.net/
|
@@ -96,11 +103,11 @@ private
|
|
96
103
|
end
|
97
104
|
|
98
105
|
cost = s[i] == t[j] ? 0 : 1
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
106
|
+
insertion = d[j + 1] + 1
|
107
|
+
deletion = e + 1
|
108
|
+
substitution = d[j] + cost
|
109
|
+
x = insertion < deletion ? insertion : deletion
|
110
|
+
x = substitution if substitution < x
|
104
111
|
|
105
112
|
d[j] = e
|
106
113
|
e = x
|
@@ -116,9 +123,12 @@ private
|
|
116
123
|
end
|
117
124
|
|
118
125
|
def distance_without_maximum(str1, str2) # :nodoc:
|
119
|
-
s
|
126
|
+
s = str1.encode(Encoding::UTF_8).unpack("U*")
|
127
|
+
t = str2.encode(Encoding::UTF_8).unpack("U*")
|
128
|
+
|
120
129
|
n = s.length
|
121
130
|
m = t.length
|
131
|
+
|
122
132
|
return m if n.zero?
|
123
133
|
return n if m.zero?
|
124
134
|
|
@@ -128,12 +138,13 @@ private
|
|
128
138
|
n.times do |i|
|
129
139
|
e = i + 1
|
130
140
|
m.times do |j|
|
131
|
-
cost =
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
141
|
+
cost = s[i] == t[j] ? 0 : 1
|
142
|
+
insertion = d[j + 1] + 1
|
143
|
+
deletion = e + 1
|
144
|
+
substitution = d[j] + cost
|
145
|
+
x = insertion < deletion ? insertion : deletion
|
146
|
+
x = substitution if substitution < x
|
147
|
+
|
137
148
|
d[j] = e
|
138
149
|
e = x
|
139
150
|
end
|
data/lib/text/version.rb
CHANGED
data/test/levenshtein_test.rb
CHANGED
@@ -48,6 +48,12 @@ class LevenshteinTest < Test::Unit::TestCase
|
|
48
48
|
assert_equal 3, distance("kitten", "sitting", 4)
|
49
49
|
end
|
50
50
|
|
51
|
+
def test_should_return_calculated_distance_when_less_than_maximum_for_empty_strings
|
52
|
+
assert_equal 3, distance("", "cat", 4)
|
53
|
+
assert_equal 3, distance("cat", "", 5)
|
54
|
+
assert_equal 0, distance("", "", 2)
|
55
|
+
end
|
56
|
+
|
51
57
|
def test_should_return_calculated_distance_when_same_as_maximum
|
52
58
|
assert_equal 0, distance("test", "test", 0)
|
53
59
|
assert_equal 1, distance("test", "tent", 1)
|
@@ -55,12 +61,23 @@ class LevenshteinTest < Test::Unit::TestCase
|
|
55
61
|
assert_equal 3, distance("kitten", "sitting", 3)
|
56
62
|
end
|
57
63
|
|
64
|
+
def test_should_return_calculated_distance_when_same_as_maximum_for_empty_strings
|
65
|
+
assert_equal 3, distance("", "cat", 3)
|
66
|
+
assert_equal 3, distance("cat", "", 3)
|
67
|
+
assert_equal 0, distance("", "", 0)
|
68
|
+
end
|
69
|
+
|
58
70
|
def test_should_return_specified_maximum_if_distance_is_more
|
59
71
|
assert_equal 1, distance("gumbo", "gambol", 1)
|
60
72
|
assert_equal 2, distance("kitten", "sitting", 2)
|
61
73
|
assert_equal 1, distance("test", "tasf", 1)
|
62
74
|
end
|
63
75
|
|
76
|
+
def test_should_return_specified_maximum_if_distance_is_more_for_empty_strings
|
77
|
+
assert_equal 2, distance("kitten", "", 2)
|
78
|
+
assert_equal 3, distance("", "kitten", 3)
|
79
|
+
end
|
80
|
+
|
64
81
|
def test_should_return_maximum_distance_for_strings_with_additions_at_start
|
65
82
|
assert_equal 1, distance("1234", "01234")
|
66
83
|
assert_equal 0, distance("1234", "01234", 0)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Battley
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2015-04-13 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rake
|
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
80
80
|
version: '0'
|
81
81
|
requirements: []
|
82
82
|
rubyforge_project: text
|
83
|
-
rubygems_version: 2.
|
83
|
+
rubygems_version: 2.4.5
|
84
84
|
signing_key:
|
85
85
|
specification_version: 4
|
86
86
|
summary: A collection of text algorithms
|