text 1.2.3 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/COPYING.txt +21 -0
- data/README.rdoc +4 -0
- data/Rakefile +6 -0
- data/lib/text/double_metaphone.rb +1 -1
- data/lib/text/levenshtein.rb +96 -18
- data/lib/text/metaphone.rb +2 -2
- data/lib/text/soundex.rb +2 -4
- data/lib/text/version.rb +2 -2
- data/test/data/{metaphone.txt → metaphone.yml} +2 -0
- data/test/data/{metaphone_buggy.txt → metaphone_buggy.yml} +0 -0
- data/test/data/soundex.yml +14 -0
- data/test/double_metaphone_test.rb +17 -0
- data/test/levenshtein_test.rb +339 -69
- data/test/metaphone_test.rb +3 -3
- data/test/porter_stemming_test.rb +1 -1
- data/test/soundex_test.rb +9 -17
- data/test/text_test.rb +1 -1
- data/test/white_similarity_test.rb +1 -1
- metadata +32 -35
- data/test/data/big.txt +0 -8
- data/test/data/chunky.flf +0 -512
- data/test/data/chunky.txt +0 -5
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 990ce640247dc172ad5e84a54bec8349ccdf55bb
|
4
|
+
data.tar.gz: 25c46a8bfe4fd410959be538b10b2a288e70cbcb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6d506f2ae153149a6ebd288ce61a540931fd5d944bc661a582ba8a1fffbdaaf454873e690308202826127d8849c3ec8892969100ad224c51cf61aa5d20fde48c
|
7
|
+
data.tar.gz: 5ffcc3933066fb5131f7da6b438f89fec87d047bc2703e5ad7b4d0ed3a433c4de8c1862c66b8ff7b6dbdcb7aa7e9646c9f8594cd3e6a0630e7a25b0b7bf8e7fb
|
data/COPYING.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
== Licence (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2006-2013 Paul Battley, Michael Neumann, Tim Fletcher
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
data/lib/text/levenshtein.rb
CHANGED
@@ -16,29 +16,107 @@ module Levenshtein
|
|
16
16
|
|
17
17
|
# Calculate the Levenshtein distance between two strings +str1+ and +str2+.
|
18
18
|
#
|
19
|
+
# The optional argument max_distance can reduce the number of iterations by
|
20
|
+
# stopping if the Levenshtein distance exceeds this value. This increases
|
21
|
+
# performance where it is only necessary to compare the distance with a
|
22
|
+
# reference value instead of calculating the exact distance.
|
19
23
|
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
# strings.
|
24
|
+
# The distance is calculated in terms of Unicode codepoints. Be aware that
|
25
|
+
# this algorithm does not perform normalisation: if there is a possibility
|
26
|
+
# of different normalised forms being used, normalisation should be performed
|
27
|
+
# beforehand.
|
25
28
|
#
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
29
|
+
def distance(str1, str2, max_distance = nil)
|
30
|
+
if max_distance
|
31
|
+
distance_with_maximum(str1, str2, max_distance)
|
32
|
+
else
|
33
|
+
distance_without_maximum(str1, str2)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
def distance_with_maximum(str1, str2, max_distance) # :nodoc:
|
39
|
+
s, t = [str1, str2].sort_by(&:length).
|
40
|
+
map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
|
41
|
+
n = s.length
|
42
|
+
m = t.length
|
43
|
+
big_int = n * m
|
44
|
+
return m if n.zero?
|
45
|
+
return n if m.zero?
|
46
|
+
return 0 if s == t
|
47
|
+
|
48
|
+
# If the length difference is already at least the max_distance, then
|
49
|
+
# there is nothing else to check
|
50
|
+
if (n - m).abs >= max_distance
|
51
|
+
return max_distance
|
52
|
+
end
|
53
|
+
|
54
|
+
# The values necessary for our threshold are written; the ones after must
|
55
|
+
# be filled with large integers since the trailing member of the threshold
|
56
|
+
# window in the bottom array will run min across them
|
57
|
+
d = (m + 1).times.map { |i|
|
58
|
+
if i < m || i < max_distance + 1
|
59
|
+
i
|
36
60
|
else
|
37
|
-
|
38
|
-
lambda { |str| str.unpack(rule) }
|
61
|
+
big_int
|
39
62
|
end
|
63
|
+
}
|
64
|
+
x = nil
|
65
|
+
e = nil
|
66
|
+
|
67
|
+
n.times do |i|
|
68
|
+
# Since we're reusing arrays, we need to be sure to wipe the value left
|
69
|
+
# of the starting index; we don't have to worry about the value above the
|
70
|
+
# ending index as the arrays were initially filled with large integers
|
71
|
+
# and we progress to the right
|
72
|
+
if e.nil?
|
73
|
+
e = i + 1
|
74
|
+
else
|
75
|
+
e = big_int
|
76
|
+
end
|
77
|
+
|
78
|
+
diag_index = t.length - s.length + i
|
79
|
+
|
80
|
+
# If max_distance was specified, we can reduce the second loop. So we set
|
81
|
+
# up our threshold window.
|
82
|
+
# See:
|
83
|
+
# Gusfield, Dan (1997). Algorithms on strings, trees, and sequences:
|
84
|
+
# computer science and computational biology.
|
85
|
+
# Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
|
86
|
+
# pp. 263–264.
|
87
|
+
min = [0, i - max_distance - 1].max
|
88
|
+
max = [m - 1, i + max_distance].min
|
89
|
+
|
90
|
+
(min .. max).each do |j|
|
91
|
+
# If the diagonal value has already reached the max_distance
|
92
|
+
# then we can safely return: the diagonal will never go lower again.
|
93
|
+
# See: http://www.levenshtein.net/
|
94
|
+
if j == diag_index && d[j] >= max_distance
|
95
|
+
return max_distance
|
96
|
+
end
|
97
|
+
|
98
|
+
cost = s[i] == t[j] ? 0 : 1
|
99
|
+
x = [
|
100
|
+
d[j+1] + 1, # insertion
|
101
|
+
e + 1, # deletion
|
102
|
+
d[j] + cost # substitution
|
103
|
+
].min
|
104
|
+
|
105
|
+
d[j] = e
|
106
|
+
e = x
|
107
|
+
end
|
108
|
+
d[m] = x
|
109
|
+
end
|
110
|
+
|
111
|
+
if x > max_distance
|
112
|
+
return max_distance
|
113
|
+
else
|
114
|
+
return x
|
115
|
+
end
|
116
|
+
end
|
40
117
|
|
41
|
-
|
118
|
+
def distance_without_maximum(str1, str2) # :nodoc:
|
119
|
+
s, t = [str1, str2].map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
|
42
120
|
n = s.length
|
43
121
|
m = t.length
|
44
122
|
return m if n.zero?
|
data/lib/text/metaphone.rb
CHANGED
@@ -45,7 +45,7 @@ module Metaphone
|
|
45
45
|
[ /c(?=ia)/, 'X' ],
|
46
46
|
[ /[st](?=i[ao])/, 'X' ],
|
47
47
|
[ /s?c(?=[iey])/, 'S' ],
|
48
|
-
[ /
|
48
|
+
[ /(ck?|q)/, 'K' ],
|
49
49
|
[ /dg(?=[iey])/, 'J' ],
|
50
50
|
[ /d/, 'T' ],
|
51
51
|
[ /g(?=h[^aeiou])/, '' ],
|
@@ -94,4 +94,4 @@ private
|
|
94
94
|
extend self
|
95
95
|
|
96
96
|
end
|
97
|
-
end
|
97
|
+
end
|
data/lib/text/soundex.rb
CHANGED
@@ -27,9 +27,9 @@ module Soundex
|
|
27
27
|
# do not change the parameter "str"
|
28
28
|
#
|
29
29
|
def soundex_str(str)
|
30
|
+
str = str.upcase.gsub(/[^A-Z]/, "")
|
30
31
|
return nil if str.empty?
|
31
32
|
|
32
|
-
str = str.upcase
|
33
33
|
last_code = get_code(str[0,1])
|
34
34
|
soundex_code = str[0,1]
|
35
35
|
|
@@ -40,15 +40,13 @@ module Soundex
|
|
40
40
|
|
41
41
|
if code == "0" then
|
42
42
|
last_code = nil
|
43
|
-
elsif code == nil then
|
44
|
-
return nil
|
45
43
|
elsif code != last_code then
|
46
44
|
soundex_code += code
|
47
45
|
last_code = code
|
48
46
|
end
|
49
47
|
end # for
|
50
48
|
|
51
|
-
return soundex_code
|
49
|
+
return soundex_code.ljust(4, "0")
|
52
50
|
end
|
53
51
|
module_function :soundex_str
|
54
52
|
|
data/lib/text/version.rb
CHANGED
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require_relative "./test_helper"
|
2
|
+
require "text/double_metaphone"
|
3
|
+
|
4
|
+
require 'csv'
|
5
|
+
|
6
|
+
class DoubleMetaphoneTest < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def test_cases
|
9
|
+
CSV.open(data_file_path('double_metaphone.csv'), 'r').to_a.each do |row|
|
10
|
+
primary, secondary = Text::Metaphone.double_metaphone(row[0])
|
11
|
+
|
12
|
+
assert_equal row[1], primary
|
13
|
+
assert_equal row[2], secondary.nil?? primary : secondary
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
data/test/levenshtein_test.rb
CHANGED
@@ -1,93 +1,363 @@
|
|
1
|
-
|
1
|
+
# coding: UTF-8
|
2
|
+
|
3
|
+
require_relative "./test_helper"
|
2
4
|
require "text/levenshtein"
|
3
5
|
|
4
6
|
class LevenshteinTest < Test::Unit::TestCase
|
5
|
-
|
6
7
|
include Text::Levenshtein
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
['test', 'tent', 1],
|
12
|
-
['gumbo', 'gambol', 2],
|
13
|
-
['kitten', 'sitting', 3]
|
14
|
-
],
|
15
|
-
:empty => [
|
16
|
-
['foo', '', 3],
|
17
|
-
['', '', 0],
|
18
|
-
['a', '', 1]
|
19
|
-
],
|
20
|
-
:utf8 => [
|
21
|
-
["f\303\266o", 'foo', 1],
|
22
|
-
["fran\303\247ais", 'francais', 1],
|
23
|
-
["fran\303\247ais", "fran\303\246ais", 1],
|
24
|
-
[
|
25
|
-
"\347\247\201\343\201\256\345\220\215\345\211\215\343\201\257"<<
|
26
|
-
"\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
|
27
|
-
"\343\201\274\343\201\217\343\201\256\345\220\215\345\211\215\343\201"<<
|
28
|
-
"\257\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
|
29
|
-
2
|
30
|
-
] # Japanese
|
31
|
-
],
|
32
|
-
:iso_8859_1 => [
|
33
|
-
["f\366o", 'foo', 1],
|
34
|
-
["fran\347ais", 'francais', 1],
|
35
|
-
["fran\347ais", "fran\346ais", 1]
|
36
|
-
],
|
37
|
-
:edge => [
|
38
|
-
['a', 'a', 0],
|
39
|
-
['0123456789', 'abcdefghijklmnopqrstuvwxyz', 26]
|
40
|
-
]
|
41
|
-
}
|
42
|
-
|
43
|
-
def assert_set(name)
|
44
|
-
TEST_CASES[name].each do |s, t, x|
|
45
|
-
if defined?(Encoding) && Encoding.default_internal # Change the encoding if in 1.9
|
46
|
-
t.force_encoding(Encoding.default_internal)
|
47
|
-
s.force_encoding(Encoding.default_internal)
|
48
|
-
end
|
9
|
+
def iso_8859_1(s)
|
10
|
+
s.force_encoding(Encoding::ISO_8859_1)
|
11
|
+
end
|
49
12
|
|
50
|
-
|
51
|
-
|
52
|
-
|
13
|
+
def test_should_calculate_lengths_for_basic_examples
|
14
|
+
assert_equal 0, distance("test", "test")
|
15
|
+
assert_equal 1, distance("test", "tent")
|
16
|
+
assert_equal 2, distance("gumbo", "gambol")
|
17
|
+
assert_equal 3, distance("kitten", "sitting")
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_should_give_full_distances_for_empty_strings
|
21
|
+
assert_equal 3, distance("foo", "")
|
22
|
+
assert_equal 0, distance("", "")
|
23
|
+
assert_equal 1, distance("a", "")
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_should_treat_utf_8_codepoints_as_one_element
|
27
|
+
assert_equal 1, distance("föo", "foo")
|
28
|
+
assert_equal 1, distance("français", "francais")
|
29
|
+
assert_equal 1, distance("français", "franæais")
|
30
|
+
assert_equal 2, distance("私の名前はポールです", "ぼくの名前はポールです")
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_should_process_single_byte_encodings
|
34
|
+
assert_equal 1, distance(iso_8859_1("f\xF6o"), iso_8859_1("foo"))
|
35
|
+
assert_equal 1, distance(iso_8859_1("fran\xE7ais"), iso_8859_1("francais"))
|
36
|
+
assert_equal 1, distance(iso_8859_1("fran\xE7ais"), iso_8859_1("fran\xE6ais"))
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_should_process_edge_cases_as_expected
|
40
|
+
assert_equal 0, distance("a", "a")
|
41
|
+
assert_equal 26, distance("0123456789", "abcdefghijklmnopqrstuvwxyz")
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_should_return_calculated_distance_when_less_than_maximum
|
45
|
+
assert_equal 0, distance("test", "test", 1)
|
46
|
+
assert_equal 1, distance("test", "tent", 2)
|
47
|
+
assert_equal 2, distance("gumbo", "gambol", 3)
|
48
|
+
assert_equal 3, distance("kitten", "sitting", 4)
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_should_return_calculated_distance_when_same_as_maximum
|
52
|
+
assert_equal 0, distance("test", "test", 0)
|
53
|
+
assert_equal 1, distance("test", "tent", 1)
|
54
|
+
assert_equal 2, distance("gumbo", "gambol", 2)
|
55
|
+
assert_equal 3, distance("kitten", "sitting", 3)
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_should_return_specified_maximum_if_distance_is_more
|
59
|
+
assert_equal 1, distance("gumbo", "gambol", 1)
|
60
|
+
assert_equal 2, distance("kitten", "sitting", 2)
|
61
|
+
assert_equal 1, distance("test", "tasf", 1)
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_start
|
65
|
+
assert_equal 1, distance("1234", "01234")
|
66
|
+
assert_equal 0, distance("1234", "01234", 0)
|
67
|
+
assert_equal 1, distance("1234", "01234", 1)
|
68
|
+
assert_equal 1, distance("1234", "01234", 2)
|
69
|
+
assert_equal 1, distance("1234", "01234", 3)
|
70
|
+
assert_equal 1, distance("1234", "01234", 5)
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_end
|
74
|
+
assert_equal 2, distance("1234", "123400")
|
75
|
+
assert_equal 0, distance("1234", "123400", 0)
|
76
|
+
assert_equal 1, distance("1234", "123400", 1)
|
77
|
+
assert_equal 2, distance("1234", "123400", 2)
|
78
|
+
assert_equal 2, distance("1234", "123400", 3)
|
79
|
+
assert_equal 2, distance("1234", "123400", 5)
|
80
|
+
end
|
81
|
+
|
82
|
+
def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle
|
83
|
+
assert_equal 1, distance("1234", "12034")
|
84
|
+
assert_equal 0, distance("1234", "12034", 0)
|
85
|
+
assert_equal 1, distance("1234", "12034", 1)
|
86
|
+
assert_equal 1, distance("1234", "12034", 2)
|
87
|
+
assert_equal 1, distance("1234", "12034", 5)
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_in_the_middle
|
91
|
+
assert_equal 2, distance("1234", "012034")
|
92
|
+
assert_equal 0, distance("1234", "012034", 0)
|
93
|
+
assert_equal 1, distance("1234", "012034", 1)
|
94
|
+
assert_equal 2, distance("1234", "012034", 2)
|
95
|
+
assert_equal 2, distance("1234", "012034", 3)
|
96
|
+
assert_equal 2, distance("1234", "012034", 5)
|
97
|
+
end
|
98
|
+
|
99
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_end_and_in_the_middle
|
100
|
+
assert_equal 2, distance("1234", "120340")
|
101
|
+
assert_equal 0, distance("1234", "120340", 0)
|
102
|
+
assert_equal 1, distance("1234", "120340", 1)
|
103
|
+
assert_equal 2, distance("1234", "120340", 2)
|
104
|
+
assert_equal 2, distance("1234", "120340", 3)
|
105
|
+
assert_equal 2, distance("1234", "120340", 5)
|
106
|
+
end
|
107
|
+
|
108
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_start_at_end_and_in_the_middle
|
109
|
+
assert_equal 3, distance("1234", "0120340")
|
110
|
+
assert_equal 0, distance("1234", "0120340", 0)
|
111
|
+
assert_equal 3, distance("1234", "0120340", 3)
|
112
|
+
assert_equal 3, distance("1234", "0120340", 4)
|
113
|
+
assert_equal 3, distance("1234", "0120340", 6)
|
114
|
+
end
|
115
|
+
|
116
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_char_changes
|
117
|
+
assert_equal 3, distance("1234", "001233")
|
118
|
+
assert_equal 0, distance("1234", "001233", 0)
|
119
|
+
assert_equal 2, distance("1234", "001233", 2)
|
120
|
+
assert_equal 3, distance("1234", "001233", 3)
|
121
|
+
assert_equal 3, distance("1234", "001233", 4)
|
122
|
+
assert_equal 3, distance("1234", "001233", 5)
|
123
|
+
end
|
124
|
+
|
125
|
+
def test_should_return_maximum_distance_for_strings_with_deletions_at_end
|
126
|
+
assert_equal 1, distance("1234", "123")
|
127
|
+
assert_equal 0, distance("1234", "123", 0)
|
128
|
+
assert_equal 1, distance("1234", "123", 1)
|
129
|
+
assert_equal 1, distance("1234", "123", 2)
|
130
|
+
assert_equal 1, distance("1234", "123", 5)
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_should_return_maximum_distance_for_strings_with_deletions_at_start
|
134
|
+
assert_equal 1, distance("1234", "234")
|
135
|
+
assert_equal 0, distance("1234", "234", 0)
|
136
|
+
assert_equal 1, distance("1234", "234", 1)
|
137
|
+
assert_equal 1, distance("1234", "234", 2)
|
138
|
+
assert_equal 1, distance("1234", "234", 5)
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_should_return_maximum_distance_for_strings_with_deletions_at_start_and_in_the_middle
|
142
|
+
assert_equal 2, distance("1234", "24")
|
143
|
+
assert_equal 0, distance("1234", "24", 0)
|
144
|
+
assert_equal 1, distance("1234", "24", 1)
|
145
|
+
assert_equal 2, distance("1234", "24", 2)
|
146
|
+
assert_equal 2, distance("1234", "24", 3)
|
147
|
+
assert_equal 2, distance("1234", "24", 5)
|
148
|
+
end
|
149
|
+
|
150
|
+
def test_should_return_maximum_distance_for_strings_with_deletions_at_end_and_in_the_middle
|
151
|
+
assert_equal 2, distance("1234", "13")
|
152
|
+
assert_equal 0, distance("1234", "13", 0)
|
153
|
+
assert_equal 1, distance("1234", "13", 1)
|
154
|
+
assert_equal 2, distance("1234", "13", 2)
|
155
|
+
assert_equal 2, distance("1234", "13", 3)
|
156
|
+
assert_equal 2, distance("1234", "13", 5)
|
157
|
+
end
|
158
|
+
|
159
|
+
def test_should_return_maximum_distance_for_strings_with_deletions_at_start_at_end_and_in_the_middle
|
160
|
+
assert_equal 3, distance("12345", "24")
|
161
|
+
assert_equal 0, distance("12345", "24", 0)
|
162
|
+
assert_equal 2, distance("12345", "24", 2)
|
163
|
+
assert_equal 3, distance("12345", "24", 3)
|
164
|
+
assert_equal 3, distance("12345", "24", 4)
|
165
|
+
assert_equal 3, distance("12345", "24", 5)
|
166
|
+
end
|
167
|
+
|
168
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_deletions_in_the_middle
|
169
|
+
assert_equal 2, distance("1234", "0124")
|
170
|
+
assert_equal 0, distance("1234", "0124", 0)
|
171
|
+
assert_equal 1, distance("1234", "0124", 1)
|
172
|
+
assert_equal 2, distance("1234", "0124", 2)
|
173
|
+
assert_equal 2, distance("1234", "0124", 3)
|
174
|
+
assert_equal 2, distance("1234", "0124", 5)
|
175
|
+
end
|
176
|
+
|
177
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_deletions_at_end
|
178
|
+
assert_equal 2, distance("1234", "0123")
|
179
|
+
assert_equal 0, distance("1234", "0123", 0)
|
180
|
+
assert_equal 1, distance("1234", "0123", 1)
|
181
|
+
assert_equal 2, distance("1234", "0123", 2)
|
182
|
+
assert_equal 2, distance("1234", "0123", 3)
|
183
|
+
assert_equal 2, distance("1234", "0123", 5)
|
184
|
+
end
|
185
|
+
|
186
|
+
def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle_and_deletions_at_end
|
187
|
+
assert_equal 2, distance("1234", "1293")
|
188
|
+
assert_equal 0, distance("1234", "1293", 0)
|
189
|
+
assert_equal 1, distance("1234", "1293", 1)
|
190
|
+
assert_equal 2, distance("1234", "1293", 2)
|
191
|
+
assert_equal 2, distance("1234", "1293", 3)
|
192
|
+
assert_equal 2, distance("1234", "1293", 5)
|
193
|
+
end
|
194
|
+
|
195
|
+
def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle_and_deletions_at_start
|
196
|
+
assert_equal 2, distance("1234", "2934")
|
197
|
+
assert_equal 0, distance("1234", "2934", 0)
|
198
|
+
assert_equal 1, distance("1234", "2934", 1)
|
199
|
+
assert_equal 2, distance("1234", "2934", 2)
|
200
|
+
assert_equal 2, distance("1234", "2934", 3)
|
201
|
+
assert_equal 2, distance("1234", "2934", 5)
|
202
|
+
end
|
203
|
+
|
204
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_end_and_deletions_at_start
|
205
|
+
assert_equal 2, distance("1234", "2345")
|
206
|
+
assert_equal 0, distance("1234", "2345", 0)
|
207
|
+
assert_equal 1, distance("1234", "2345", 1)
|
208
|
+
assert_equal 2, distance("1234", "2345", 2)
|
209
|
+
assert_equal 2, distance("1234", "2345", 3)
|
210
|
+
assert_equal 2, distance("1234", "2345", 5)
|
211
|
+
end
|
212
|
+
|
213
|
+
def test_should_return_maximum_distance_for_strings_with_additions_at_end_and_deletions_in_the_middle
|
214
|
+
assert_equal 2, distance("1234", "1245")
|
215
|
+
assert_equal 0, distance("1234", "1245", 0)
|
216
|
+
assert_equal 1, distance("1234", "1245", 1)
|
217
|
+
assert_equal 2, distance("1234", "1245", 2)
|
218
|
+
assert_equal 2, distance("1234", "1245", 3)
|
219
|
+
assert_equal 2, distance("1234", "1245", 5)
|
220
|
+
end
|
221
|
+
|
222
|
+
def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle_and_deletions_in_the_middle
|
223
|
+
assert_equal 2, distance("12345", "12035")
|
224
|
+
assert_equal 0, distance("12345", "12035", 0)
|
225
|
+
assert_equal 1, distance("12345", "12035", 1)
|
226
|
+
assert_equal 2, distance("12345", "12035", 2)
|
227
|
+
assert_equal 2, distance("12345", "12035", 3)
|
228
|
+
assert_equal 2, distance("12345", "12035", 5)
|
229
|
+
end
|
230
|
+
|
231
|
+
def test_should_return_maximum_distance_for_strings_with_additions_deletions_and_char_changes
|
232
|
+
assert_equal 3, distance("1234", "0193")
|
233
|
+
assert_equal 0, distance("1234", "0193", 0)
|
234
|
+
assert_equal 1, distance("1234", "0193", 1)
|
235
|
+
assert_equal 2, distance("1234", "0193", 2)
|
236
|
+
assert_equal 3, distance("1234", "0193", 3)
|
237
|
+
assert_equal 3, distance("1234", "0193", 4)
|
238
|
+
assert_equal 3, distance("1234", "0193", 5)
|
239
|
+
|
240
|
+
assert_equal 3, distance("1234", "2395")
|
241
|
+
assert_equal 0, distance("1234", "2395", 0)
|
242
|
+
assert_equal 1, distance("1234", "2395", 1)
|
243
|
+
assert_equal 2, distance("1234", "2395", 2)
|
244
|
+
assert_equal 3, distance("1234", "2395", 3)
|
245
|
+
assert_equal 3, distance("1234", "2395", 4)
|
246
|
+
assert_equal 3, distance("1234", "2395", 5)
|
247
|
+
end
|
248
|
+
|
249
|
+
def test_should_return_maximum_distance_for_strings_with_only_one_char
|
250
|
+
assert_equal 1, distance("t", "a")
|
251
|
+
assert_equal 0, distance("t", "a", 0)
|
252
|
+
assert_equal 1, distance("t", "a", 1)
|
253
|
+
assert_equal 1, distance("t", "a", 2)
|
254
|
+
assert_equal 1, distance("t", "a", 10)
|
255
|
+
|
256
|
+
assert_equal 0, distance("t", "t")
|
257
|
+
assert_equal 0, distance("t", "t", 1)
|
258
|
+
assert_equal 0, distance("t", "t", 4)
|
259
|
+
|
260
|
+
assert_equal 1, distance("te", "t")
|
261
|
+
assert_equal 0, distance("te", "t", 0)
|
262
|
+
assert_equal 1, distance("te", "t", 1)
|
263
|
+
assert_equal 1, distance("te", "t", 2)
|
264
|
+
assert_equal 1, distance("te", "t", 4)
|
265
|
+
end
|
266
|
+
|
267
|
+
def test_should_return_maximum_distance_for_a_long_string
|
268
|
+
assert_equal 440, distance( "Having a catchy name, easy reminder for all is fundamental when choosing the name for a new product. A bad name can be the beginning of the end product and immediately forget this.</p> <p>Primary keys to choose a good brand name are, first: choose a name that only has one word and at most three, such being the optimum. Try to make it easier to read and pronounce, as this will be easier to remember for all the time to talk about your product. Remember, too, that the use of capitalization also influence, you should treat the name of your product as if it were the same logo. And finally, you should avoid using numbers in your product name, unless it is a very easy to remember because this number were tied deeply with your product. Always think globally, independent of which only sell locally, you never know when it can come out in sales and need to make a point.",
|
269
|
+
"All product lines work with tags that identify its products and differentiate it from the others or with labels for packaged, or perhaps labels to be placed in the envelopes that you send to your customers. There are thousands options, shapes, designs and colors that you can use and advantage of these is that they can also be adhesive. If you need a label that serve you and that you identify will have your order. You will receive many proposals that you can discard if they don't like you or you keep it if you like and fits your needs. Don't miss the opportunity to innovate and use all the tools that allow you to continue to grow as a company. REMEMBER! a good label, with a good design can increase your sales by 20% just by its appearance.",
|
270
|
+
440 )
|
53
271
|
end
|
54
272
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
else # 1.8 backwards compat
|
62
|
-
old_kcode = $KCODE
|
63
|
-
$KCODE = kcode
|
64
|
-
yield
|
65
|
-
$KCODE = old_kcode
|
273
|
+
end
|
274
|
+
|
275
|
+
class LevenshteinGeneratedDataTest < Test::Unit::TestCase
|
276
|
+
Element = Struct.new(:char, :added) do
|
277
|
+
def to_s
|
278
|
+
char
|
66
279
|
end
|
67
280
|
end
|
68
281
|
|
69
|
-
def
|
70
|
-
|
282
|
+
def one_of(str)
|
283
|
+
str[rand(str.length)]
|
284
|
+
end
|
285
|
+
|
286
|
+
def letter
|
287
|
+
one_of "abcdefghijklmnopqrstuvwxyzáéíóúあいうえお日月火水木"
|
288
|
+
end
|
289
|
+
|
290
|
+
def word
|
291
|
+
(rand(10) + 2).times.map { letter }.join("")
|
292
|
+
end
|
293
|
+
|
294
|
+
def sentence
|
295
|
+
(rand(10) + 2).times.map { word }.join(" ")
|
71
296
|
end
|
72
297
|
|
73
|
-
def
|
74
|
-
|
298
|
+
def sequence
|
299
|
+
sentence.scan(/./).map { |c| Element.new(c, true) }
|
75
300
|
end
|
76
301
|
|
77
|
-
def
|
78
|
-
|
302
|
+
def insert(seq)
|
303
|
+
elem = Element.new(letter, true)
|
304
|
+
pos = rand(seq.length)
|
305
|
+
return [seq[0, pos] + [elem] + seq[pos .. -1], 1]
|
79
306
|
end
|
80
307
|
|
81
|
-
|
82
|
-
|
83
|
-
|
308
|
+
# Delete an element, but only if we didn't add it - that would make the
|
309
|
+
# calculations complicated
|
310
|
+
def delete(seq)
|
311
|
+
pos = rand(seq.length)
|
312
|
+
if seq[pos].added
|
313
|
+
return [seq, 0]
|
314
|
+
else
|
315
|
+
return [seq[0, pos] + seq[(pos + 1) .. -1], 1]
|
84
316
|
end
|
85
317
|
end
|
86
318
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
319
|
+
def substitute(seq)
|
320
|
+
pos = rand(seq.length)
|
321
|
+
if seq[pos].added
|
322
|
+
return [seq, 0]
|
323
|
+
else
|
324
|
+
elem = Element.new(letter, false)
|
325
|
+
return [seq[0, pos] + [elem] + se[(pos + 1) .. -1], 1]
|
90
326
|
end
|
91
327
|
end
|
92
328
|
|
329
|
+
def mutate(seq)
|
330
|
+
distance = 0
|
331
|
+
rand(seq.length).times do
|
332
|
+
method = [:insert, :delete, :substitute][rand(2)]
|
333
|
+
seq, d = send(method, seq)
|
334
|
+
distance += d
|
335
|
+
end
|
336
|
+
return [seq, distance]
|
337
|
+
end
|
338
|
+
|
339
|
+
def test_generated_samples
|
340
|
+
100.times do
|
341
|
+
input = sequence
|
342
|
+
output, distance = mutate(input)
|
343
|
+
a = input.map(&:to_s).join("")
|
344
|
+
b = output.map(&:to_s).join("")
|
345
|
+
assert_equal distance, Text::Levenshtein.distance(a, b)
|
346
|
+
end
|
347
|
+
end
|
348
|
+
|
349
|
+
def test_generated_samples_with_maximum_distance
|
350
|
+
100.times do
|
351
|
+
input = sequence
|
352
|
+
output, distance = mutate(input)
|
353
|
+
a = input.map(&:to_s).join("")
|
354
|
+
b = output.map(&:to_s).join("")
|
355
|
+
(0 .. distance).each do |d|
|
356
|
+
assert_equal d, Text::Levenshtein.distance(a, b, d)
|
357
|
+
end
|
358
|
+
(distance .. sequence.length).each do |d|
|
359
|
+
assert_equal distance, Text::Levenshtein.distance(a, b, d)
|
360
|
+
end
|
361
|
+
end
|
362
|
+
end
|
93
363
|
end
|