text 1.2.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 990ce640247dc172ad5e84a54bec8349ccdf55bb
4
+ data.tar.gz: 25c46a8bfe4fd410959be538b10b2a288e70cbcb
5
+ SHA512:
6
+ metadata.gz: 6d506f2ae153149a6ebd288ce61a540931fd5d944bc661a582ba8a1fffbdaaf454873e690308202826127d8849c3ec8892969100ad224c51cf61aa5d20fde48c
7
+ data.tar.gz: 5ffcc3933066fb5131f7da6b438f89fec87d047bc2703e5ad7b4d0ed3a433c4de8c1862c66b8ff7b6dbdcb7aa7e9646c9f8594cd3e6a0630e7a25b0b7bf8e7fb
@@ -0,0 +1,21 @@
1
+ == Licence (MIT)
2
+
3
+ Copyright (c) 2006-2013 Paul Battley, Michael Neumann, Tim Fletcher
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -12,6 +12,10 @@ A collection of text algorithms.
12
12
  # => 0
13
13
  Text::Levenshtein.distance('test', 'tent')
14
14
  # => 1
15
+ Text::Levenshtein.distance('test', 'testing')
16
+ # => 3
17
+ Text::Levenshtein.distance('test', 'testing', 2)
18
+ # => 2
15
19
 
16
20
  === Metaphone
17
21
 
data/Rakefile CHANGED
@@ -7,4 +7,10 @@ Rake::TestTask.new do |t|
7
7
  t.verbose = false
8
8
  end
9
9
 
10
+ desc "Run benchmark"
11
+ task :benchmark do |t|
12
+ system "ruby -v"
13
+ system "ruby perf/benchmark.rb"
14
+ end
15
+
10
16
  task :default => :test
@@ -5,7 +5,7 @@
5
5
  #
6
6
  # Based on Stephen Woodbridge's PHP version - http://swoodbridge.com/DoubleMetaPhone/
7
7
  #
8
- # Author: Tim Fletcher (twoggle@gmail.com)
8
+ # Author: Tim Fletcher (mail@tfletcher.com)
9
9
  #
10
10
 
11
11
  module Text # :nodoc:
@@ -16,29 +16,107 @@ module Levenshtein
16
16
 
17
17
  # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
18
18
  #
19
+ # The optional argument max_distance can reduce the number of iterations by
20
+ # stopping if the Levenshtein distance exceeds this value. This increases
21
+ # performance where it is only necessary to compare the distance with a
22
+ # reference value instead of calculating the exact distance.
19
23
  #
20
- # In Ruby 1.8, +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per
21
- # character encoding such as ISO-8859-*. They will be treated as UTF-8 if
22
- # $KCODE is set appropriately (i.e. 'u'). Otherwise, the comparison will be
23
- # performed byte-by-byte. There is no specific support for Shift-JIS or EUC
24
- # strings.
24
+ # The distance is calculated in terms of Unicode codepoints. Be aware that
25
+ # this algorithm does not perform normalisation: if there is a possibility
26
+ # of different normalised forms being used, normalisation should be performed
27
+ # beforehand.
25
28
  #
26
- # In Ruby 1.9+, the strings will be processed as UTF-8.
27
- #
28
- # When using Unicode text, be aware that this algorithm does not perform
29
- # normalisation. If there is a possibility of different normalised forms
30
- # being used, normalisation should be performed beforehand.
31
- #
32
- def distance(str1, str2)
33
- prepare =
34
- if "ruby".respond_to?(:encoding)
35
- lambda { |str| str.encode(Encoding::UTF_8).unpack("U*") }
29
+ def distance(str1, str2, max_distance = nil)
30
+ if max_distance
31
+ distance_with_maximum(str1, str2, max_distance)
32
+ else
33
+ distance_without_maximum(str1, str2)
34
+ end
35
+ end
36
+
37
+ private
38
+ def distance_with_maximum(str1, str2, max_distance) # :nodoc:
39
+ s, t = [str1, str2].sort_by(&:length).
40
+ map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
41
+ n = s.length
42
+ m = t.length
43
+ big_int = n * m
44
+ return m if n.zero?
45
+ return n if m.zero?
46
+ return 0 if s == t
47
+
48
+ # If the length difference is already at least max_distance, then
49
+ # there is nothing else to check
50
+ if (n - m).abs >= max_distance
51
+ return max_distance
52
+ end
53
+
54
+ # The values necessary for our threshold are written; the ones after must
55
+ # be filled with large integers since the trailing member of the threshold
56
+ # window in the bottom array will run min across them
57
+ d = (m + 1).times.map { |i|
58
+ if i < m || i < max_distance + 1
59
+ i
36
60
  else
37
- rule = $KCODE.match(/^U/i) ? "U*" : "C*"
38
- lambda { |str| str.unpack(rule) }
61
+ big_int
39
62
  end
63
+ }
64
+ x = nil
65
+ e = nil
66
+
67
+ n.times do |i|
68
+ # Since we're reusing arrays, we need to be sure to wipe the value left
69
+ # of the starting index; we don't have to worry about the value above the
70
+ # ending index as the arrays were initially filled with large integers
71
+ # and we progress to the right
72
+ if e.nil?
73
+ e = i + 1
74
+ else
75
+ e = big_int
76
+ end
77
+
78
+ diag_index = t.length - s.length + i
79
+
80
+ # If max_distance was specified, we can reduce the second loop. So we set
81
+ # up our threshold window.
82
+ # See:
83
+ # Gusfield, Dan (1997). Algorithms on strings, trees, and sequences:
84
+ # computer science and computational biology.
85
+ # Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
86
+ # pp. 263–264.
87
+ min = [0, i - max_distance - 1].max
88
+ max = [m - 1, i + max_distance].min
89
+
90
+ (min .. max).each do |j|
91
+ # If the diagonal value is already at least max_distance
92
+ # then we can safely return: the diagonal will never go lower again.
93
+ # See: http://www.levenshtein.net/
94
+ if j == diag_index && d[j] >= max_distance
95
+ return max_distance
96
+ end
97
+
98
+ cost = s[i] == t[j] ? 0 : 1
99
+ x = [
100
+ d[j+1] + 1, # insertion
101
+ e + 1, # deletion
102
+ d[j] + cost # substitution
103
+ ].min
104
+
105
+ d[j] = e
106
+ e = x
107
+ end
108
+ d[m] = x
109
+ end
110
+
111
+ if x > max_distance
112
+ return max_distance
113
+ else
114
+ return x
115
+ end
116
+ end
40
117
 
41
- s, t = [str1, str2].map(&prepare)
118
+ def distance_without_maximum(str1, str2) # :nodoc:
119
+ s, t = [str1, str2].map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
42
120
  n = s.length
43
121
  m = t.length
44
122
  return m if n.zero?
@@ -45,7 +45,7 @@ module Metaphone
45
45
  [ /c(?=ia)/, 'X' ],
46
46
  [ /[st](?=i[ao])/, 'X' ],
47
47
  [ /s?c(?=[iey])/, 'S' ],
48
- [ /[cq]/, 'K' ],
48
+ [ /(ck?|q)/, 'K' ],
49
49
  [ /dg(?=[iey])/, 'J' ],
50
50
  [ /d/, 'T' ],
51
51
  [ /g(?=h[^aeiou])/, '' ],
@@ -94,4 +94,4 @@ private
94
94
  extend self
95
95
 
96
96
  end
97
- end
97
+ end
@@ -27,9 +27,9 @@ module Soundex
27
27
  # do not change the parameter "str"
28
28
  #
29
29
  def soundex_str(str)
30
+ str = str.upcase.gsub(/[^A-Z]/, "")
30
31
  return nil if str.empty?
31
32
 
32
- str = str.upcase
33
33
  last_code = get_code(str[0,1])
34
34
  soundex_code = str[0,1]
35
35
 
@@ -40,15 +40,13 @@ module Soundex
40
40
 
41
41
  if code == "0" then
42
42
  last_code = nil
43
- elsif code == nil then
44
- return nil
45
43
  elsif code != last_code then
46
44
  soundex_code += code
47
45
  last_code = code
48
46
  end
49
47
  end # for
50
48
 
51
- return soundex_code + "000"[0,4-soundex_code.size]
49
+ return soundex_code.ljust(4, "0")
52
50
  end
53
51
  module_function :soundex_str
54
52
 
@@ -1,8 +1,8 @@
1
1
  module Text
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 1
4
- MINOR = 2
5
- TINY = 3
4
+ MINOR = 3
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -26,6 +26,8 @@ CHUTE: XT
26
26
  SCHUSS: SXS
27
27
  OTTO: OT
28
28
  ERIC: ERK
29
+ BUCK: BK
30
+ COCK: KK
29
31
  DAVE: TF
30
32
  CATHERINE: K0RN
31
33
  KATHERINE: K0RN
@@ -0,0 +1,14 @@
1
+ Euler: E460
2
+ Ellery: E460
3
+ Gauss: G200
4
+ Ghosh: G200
5
+ Hilbert: H416
6
+ Heilbronn: H416
7
+ Knuth: K530
8
+ Kant: K530
9
+ Lloyd: L300
10
+ Ladd: L300
11
+ Lukasiewicz: L222
12
+ Lissajous: L222
13
+ SanFrancisco: S516
14
+ "San Francisco": S516
@@ -0,0 +1,17 @@
1
+ require_relative "./test_helper"
2
+ require "text/double_metaphone"
3
+
4
+ require 'csv'
5
+
6
+ class DoubleMetaphoneTest < Test::Unit::TestCase
7
+
8
+ def test_cases
9
+ CSV.open(data_file_path('double_metaphone.csv'), 'r').to_a.each do |row|
10
+ primary, secondary = Text::Metaphone.double_metaphone(row[0])
11
+
12
+ assert_equal row[1], primary
13
+ assert_equal row[2], secondary.nil?? primary : secondary
14
+ end
15
+ end
16
+
17
+ end
@@ -1,93 +1,363 @@
1
- require "test_helper"
1
+ # coding: UTF-8
2
+
3
+ require_relative "./test_helper"
2
4
  require "text/levenshtein"
3
5
 
4
6
  class LevenshteinTest < Test::Unit::TestCase
5
-
6
7
  include Text::Levenshtein
7
8
 
8
- TEST_CASES = {
9
- :easy => [
10
- ['test', 'test', 0],
11
- ['test', 'tent', 1],
12
- ['gumbo', 'gambol', 2],
13
- ['kitten', 'sitting', 3]
14
- ],
15
- :empty => [
16
- ['foo', '', 3],
17
- ['', '', 0],
18
- ['a', '', 1]
19
- ],
20
- :utf8 => [
21
- ["f\303\266o", 'foo', 1],
22
- ["fran\303\247ais", 'francais', 1],
23
- ["fran\303\247ais", "fran\303\246ais", 1],
24
- [
25
- "\347\247\201\343\201\256\345\220\215\345\211\215\343\201\257"<<
26
- "\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
27
- "\343\201\274\343\201\217\343\201\256\345\220\215\345\211\215\343\201"<<
28
- "\257\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
29
- 2
30
- ] # Japanese
31
- ],
32
- :iso_8859_1 => [
33
- ["f\366o", 'foo', 1],
34
- ["fran\347ais", 'francais', 1],
35
- ["fran\347ais", "fran\346ais", 1]
36
- ],
37
- :edge => [
38
- ['a', 'a', 0],
39
- ['0123456789', 'abcdefghijklmnopqrstuvwxyz', 26]
40
- ]
41
- }
42
-
43
- def assert_set(name)
44
- TEST_CASES[name].each do |s, t, x|
45
- if defined?(Encoding) && Encoding.default_internal # Change the encoding if in 1.9
46
- t.force_encoding(Encoding.default_internal)
47
- s.force_encoding(Encoding.default_internal)
48
- end
9
+ def iso_8859_1(s)
10
+ s.force_encoding(Encoding::ISO_8859_1)
11
+ end
49
12
 
50
- assert_equal x, distance(s, t)
51
- assert_equal x, distance(t, s)
52
- end
13
+ def test_should_calculate_lengths_for_basic_examples
14
+ assert_equal 0, distance("test", "test")
15
+ assert_equal 1, distance("test", "tent")
16
+ assert_equal 2, distance("gumbo", "gambol")
17
+ assert_equal 3, distance("kitten", "sitting")
18
+ end
19
+
20
+ def test_should_give_full_distances_for_empty_strings
21
+ assert_equal 3, distance("foo", "")
22
+ assert_equal 0, distance("", "")
23
+ assert_equal 1, distance("a", "")
24
+ end
25
+
26
+ def test_should_treat_utf_8_codepoints_as_one_element
27
+ assert_equal 1, distance("föo", "foo")
28
+ assert_equal 1, distance("français", "francais")
29
+ assert_equal 1, distance("français", "franæais")
30
+ assert_equal 2, distance("私の名前はポールです", "ぼくの名前はポールです")
31
+ end
32
+
33
+ def test_should_process_single_byte_encodings
34
+ assert_equal 1, distance(iso_8859_1("f\xF6o"), iso_8859_1("foo"))
35
+ assert_equal 1, distance(iso_8859_1("fran\xE7ais"), iso_8859_1("francais"))
36
+ assert_equal 1, distance(iso_8859_1("fran\xE7ais"), iso_8859_1("fran\xE6ais"))
37
+ end
38
+
39
+ def test_should_process_edge_cases_as_expected
40
+ assert_equal 0, distance("a", "a")
41
+ assert_equal 26, distance("0123456789", "abcdefghijklmnopqrstuvwxyz")
42
+ end
43
+
44
+ def test_should_return_calculated_distance_when_less_than_maximum
45
+ assert_equal 0, distance("test", "test", 1)
46
+ assert_equal 1, distance("test", "tent", 2)
47
+ assert_equal 2, distance("gumbo", "gambol", 3)
48
+ assert_equal 3, distance("kitten", "sitting", 4)
49
+ end
50
+
51
+ def test_should_return_calculated_distance_when_same_as_maximum
52
+ assert_equal 0, distance("test", "test", 0)
53
+ assert_equal 1, distance("test", "tent", 1)
54
+ assert_equal 2, distance("gumbo", "gambol", 2)
55
+ assert_equal 3, distance("kitten", "sitting", 3)
56
+ end
57
+
58
+ def test_should_return_specified_maximum_if_distance_is_more
59
+ assert_equal 1, distance("gumbo", "gambol", 1)
60
+ assert_equal 2, distance("kitten", "sitting", 2)
61
+ assert_equal 1, distance("test", "tasf", 1)
62
+ end
63
+
64
+ def test_should_return_maximum_distance_for_strings_with_additions_at_start
65
+ assert_equal 1, distance("1234", "01234")
66
+ assert_equal 0, distance("1234", "01234", 0)
67
+ assert_equal 1, distance("1234", "01234", 1)
68
+ assert_equal 1, distance("1234", "01234", 2)
69
+ assert_equal 1, distance("1234", "01234", 3)
70
+ assert_equal 1, distance("1234", "01234", 5)
71
+ end
72
+
73
+ def test_should_return_maximum_distance_for_strings_with_additions_at_end
74
+ assert_equal 2, distance("1234", "123400")
75
+ assert_equal 0, distance("1234", "123400", 0)
76
+ assert_equal 1, distance("1234", "123400", 1)
77
+ assert_equal 2, distance("1234", "123400", 2)
78
+ assert_equal 2, distance("1234", "123400", 3)
79
+ assert_equal 2, distance("1234", "123400", 5)
80
+ end
81
+
82
+ def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle
83
+ assert_equal 1, distance("1234", "12034")
84
+ assert_equal 0, distance("1234", "12034", 0)
85
+ assert_equal 1, distance("1234", "12034", 1)
86
+ assert_equal 1, distance("1234", "12034", 2)
87
+ assert_equal 1, distance("1234", "12034", 5)
88
+ end
89
+
90
+ def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_in_the_middle
91
+ assert_equal 2, distance("1234", "012034")
92
+ assert_equal 0, distance("1234", "012034", 0)
93
+ assert_equal 1, distance("1234", "012034", 1)
94
+ assert_equal 2, distance("1234", "012034", 2)
95
+ assert_equal 2, distance("1234", "012034", 3)
96
+ assert_equal 2, distance("1234", "012034", 5)
97
+ end
98
+
99
+ def test_should_return_maximum_distance_for_strings_with_additions_at_end_and_in_the_middle
100
+ assert_equal 2, distance("1234", "120340")
101
+ assert_equal 0, distance("1234", "120340", 0)
102
+ assert_equal 1, distance("1234", "120340", 1)
103
+ assert_equal 2, distance("1234", "120340", 2)
104
+ assert_equal 2, distance("1234", "120340", 3)
105
+ assert_equal 2, distance("1234", "120340", 5)
106
+ end
107
+
108
+ def test_should_return_maximum_distance_for_strings_with_additions_at_start_at_end_and_in_the_middle
109
+ assert_equal 3, distance("1234", "0120340")
110
+ assert_equal 0, distance("1234", "0120340", 0)
111
+ assert_equal 3, distance("1234", "0120340", 3)
112
+ assert_equal 3, distance("1234", "0120340", 4)
113
+ assert_equal 3, distance("1234", "0120340", 6)
114
+ end
115
+
116
+ def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_char_changes
117
+ assert_equal 3, distance("1234", "001233")
118
+ assert_equal 0, distance("1234", "001233", 0)
119
+ assert_equal 2, distance("1234", "001233", 2)
120
+ assert_equal 3, distance("1234", "001233", 3)
121
+ assert_equal 3, distance("1234", "001233", 4)
122
+ assert_equal 3, distance("1234", "001233", 5)
123
+ end
124
+
125
+ def test_should_return_maximum_distance_for_strings_with_deletions_at_end
126
+ assert_equal 1, distance("1234", "123")
127
+ assert_equal 0, distance("1234", "123", 0)
128
+ assert_equal 1, distance("1234", "123", 1)
129
+ assert_equal 1, distance("1234", "123", 2)
130
+ assert_equal 1, distance("1234", "123", 5)
131
+ end
132
+
133
+ def test_should_return_maximum_distance_for_strings_with_deletions_at_start
134
+ assert_equal 1, distance("1234", "234")
135
+ assert_equal 0, distance("1234", "234", 0)
136
+ assert_equal 1, distance("1234", "234", 1)
137
+ assert_equal 1, distance("1234", "234", 2)
138
+ assert_equal 1, distance("1234", "234", 5)
139
+ end
140
+
141
+ def test_should_return_maximum_distance_for_strings_with_deletions_at_start_and_in_the_middle
142
+ assert_equal 2, distance("1234", "24")
143
+ assert_equal 0, distance("1234", "24", 0)
144
+ assert_equal 1, distance("1234", "24", 1)
145
+ assert_equal 2, distance("1234", "24", 2)
146
+ assert_equal 2, distance("1234", "24", 3)
147
+ assert_equal 2, distance("1234", "24", 5)
148
+ end
149
+
150
+ def test_should_return_maximum_distance_for_strings_with_deletions_at_end_and_in_the_middle
151
+ assert_equal 2, distance("1234", "13")
152
+ assert_equal 0, distance("1234", "13", 0)
153
+ assert_equal 1, distance("1234", "13", 1)
154
+ assert_equal 2, distance("1234", "13", 2)
155
+ assert_equal 2, distance("1234", "13", 3)
156
+ assert_equal 2, distance("1234", "13", 5)
157
+ end
158
+
159
+ def test_should_return_maximum_distance_for_strings_with_deletions_at_start_at_end_and_in_the_middle
160
+ assert_equal 3, distance("12345", "24")
161
+ assert_equal 0, distance("12345", "24", 0)
162
+ assert_equal 2, distance("12345", "24", 2)
163
+ assert_equal 3, distance("12345", "24", 3)
164
+ assert_equal 3, distance("12345", "24", 4)
165
+ assert_equal 3, distance("12345", "24", 5)
166
+ end
167
+
168
+ def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_deletions_in_the_middle
169
+ assert_equal 2, distance("1234", "0124")
170
+ assert_equal 0, distance("1234", "0124", 0)
171
+ assert_equal 1, distance("1234", "0124", 1)
172
+ assert_equal 2, distance("1234", "0124", 2)
173
+ assert_equal 2, distance("1234", "0124", 3)
174
+ assert_equal 2, distance("1234", "0124", 5)
175
+ end
176
+
177
+ def test_should_return_maximum_distance_for_strings_with_additions_at_start_and_deletions_at_end
178
+ assert_equal 2, distance("1234", "0123")
179
+ assert_equal 0, distance("1234", "0123", 0)
180
+ assert_equal 1, distance("1234", "0123", 1)
181
+ assert_equal 2, distance("1234", "0123", 2)
182
+ assert_equal 2, distance("1234", "0123", 3)
183
+ assert_equal 2, distance("1234", "0123", 5)
184
+ end
185
+
186
+ def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle_and_deletions_at_end
187
+ assert_equal 2, distance("1234", "1293")
188
+ assert_equal 0, distance("1234", "1293", 0)
189
+ assert_equal 1, distance("1234", "1293", 1)
190
+ assert_equal 2, distance("1234", "1293", 2)
191
+ assert_equal 2, distance("1234", "1293", 3)
192
+ assert_equal 2, distance("1234", "1293", 5)
193
+ end
194
+
195
+ def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle_and_deletions_at_start
196
+ assert_equal 2, distance("1234", "2934")
197
+ assert_equal 0, distance("1234", "2934", 0)
198
+ assert_equal 1, distance("1234", "2934", 1)
199
+ assert_equal 2, distance("1234", "2934", 2)
200
+ assert_equal 2, distance("1234", "2934", 3)
201
+ assert_equal 2, distance("1234", "2934", 5)
202
+ end
203
+
204
+ def test_should_return_maximum_distance_for_strings_with_additions_at_end_and_deletions_at_start
205
+ assert_equal 2, distance("1234", "2345")
206
+ assert_equal 0, distance("1234", "2345", 0)
207
+ assert_equal 1, distance("1234", "2345", 1)
208
+ assert_equal 2, distance("1234", "2345", 2)
209
+ assert_equal 2, distance("1234", "2345", 3)
210
+ assert_equal 2, distance("1234", "2345", 5)
211
+ end
212
+
213
+ def test_should_return_maximum_distance_for_strings_with_additions_at_end_and_deletions_in_the_middle
214
+ assert_equal 2, distance("1234", "1245")
215
+ assert_equal 0, distance("1234", "1245", 0)
216
+ assert_equal 1, distance("1234", "1245", 1)
217
+ assert_equal 2, distance("1234", "1245", 2)
218
+ assert_equal 2, distance("1234", "1245", 3)
219
+ assert_equal 2, distance("1234", "1245", 5)
220
+ end
221
+
222
+ def test_should_return_maximum_distance_for_strings_with_additions_in_the_middle_and_deletions_in_the_middle
223
+ assert_equal 2, distance("12345", "12035")
224
+ assert_equal 0, distance("12345", "12035", 0)
225
+ assert_equal 1, distance("12345", "12035", 1)
226
+ assert_equal 2, distance("12345", "12035", 2)
227
+ assert_equal 2, distance("12345", "12035", 3)
228
+ assert_equal 2, distance("12345", "12035", 5)
229
+ end
230
+
231
+ def test_should_return_maximum_distance_for_strings_with_additions_deletions_and_char_changes
232
+ assert_equal 3, distance("1234", "0193")
233
+ assert_equal 0, distance("1234", "0193", 0)
234
+ assert_equal 1, distance("1234", "0193", 1)
235
+ assert_equal 2, distance("1234", "0193", 2)
236
+ assert_equal 3, distance("1234", "0193", 3)
237
+ assert_equal 3, distance("1234", "0193", 4)
238
+ assert_equal 3, distance("1234", "0193", 5)
239
+
240
+ assert_equal 3, distance("1234", "2395")
241
+ assert_equal 0, distance("1234", "2395", 0)
242
+ assert_equal 1, distance("1234", "2395", 1)
243
+ assert_equal 2, distance("1234", "2395", 2)
244
+ assert_equal 3, distance("1234", "2395", 3)
245
+ assert_equal 3, distance("1234", "2395", 4)
246
+ assert_equal 3, distance("1234", "2395", 5)
247
+ end
248
+
249
+ def test_should_return_maximum_distance_for_strings_with_only_one_char
250
+ assert_equal 1, distance("t", "a")
251
+ assert_equal 0, distance("t", "a", 0)
252
+ assert_equal 1, distance("t", "a", 1)
253
+ assert_equal 1, distance("t", "a", 2)
254
+ assert_equal 1, distance("t", "a", 10)
255
+
256
+ assert_equal 0, distance("t", "t")
257
+ assert_equal 0, distance("t", "t", 1)
258
+ assert_equal 0, distance("t", "t", 4)
259
+
260
+ assert_equal 1, distance("te", "t")
261
+ assert_equal 0, distance("te", "t", 0)
262
+ assert_equal 1, distance("te", "t", 1)
263
+ assert_equal 1, distance("te", "t", 2)
264
+ assert_equal 1, distance("te", "t", 4)
265
+ end
266
+
267
+ def test_should_return_maximum_distance_for_a_long_string
268
+ assert_equal 440, distance( "Having a catchy name, easy reminder for all is fundamental when choosing the name for a new product. A bad name can be the beginning of the end product and immediately forget this.</p> <p>Primary keys to choose a good brand name are, first: choose a name that only has one word and at most three, such being the optimum. Try to make it easier to read and pronounce, as this will be easier to remember for all the time to talk about your product. Remember, too, that the use of capitalization also influence, you should treat the name of your product as if it were the same logo. And finally, you should avoid using numbers in your product name, unless it is a very easy to remember because this number were tied deeply with your product. Always think globally, independent of which only sell locally, you never know when it can come out in sales and need to make a point.",
269
+ "All product lines work with tags that identify its products and differentiate it from the others or with labels for packaged, or perhaps labels to be placed in the envelopes that you send to your customers. There are thousands options, shapes, designs and colors that you can use and advantage of these is that they can also be adhesive. If you need a label that serve you and that you identify will have your order. You will receive many proposals that you can discard if they don't like you or you keep it if you like and fits your needs. Don't miss the opportunity to innovate and use all the tools that allow you to continue to grow as a company. REMEMBER! a good label, with a good design can increase your sales by 20% just by its appearance.",
270
+ 440 )
53
271
  end
54
272
 
55
- def with_encoding(kcode, encoding)
56
- if "ruby".respond_to?(:encoding)
57
- old_encoding = Encoding.default_internal
58
- Encoding.default_internal = encoding
59
- yield
60
- Encoding.default_internal = old_encoding
61
- else # 1.8 backwards compat
62
- old_kcode = $KCODE
63
- $KCODE = kcode
64
- yield
65
- $KCODE = old_kcode
273
+ end
274
+
275
+ class LevenshteinGeneratedDataTest < Test::Unit::TestCase
276
+ Element = Struct.new(:char, :added) do
277
+ def to_s
278
+ char
66
279
  end
67
280
  end
68
281
 
69
- def test_easy_cases
70
- assert_set(:easy)
282
+ def one_of(str)
283
+ str[rand(str.length)]
284
+ end
285
+
286
+ def letter
287
+ one_of "abcdefghijklmnopqrstuvwxyzáéíóúあいうえお日月火水木"
288
+ end
289
+
290
+ def word
291
+ (rand(10) + 2).times.map { letter }.join("")
292
+ end
293
+
294
+ def sentence
295
+ (rand(10) + 2).times.map { word }.join(" ")
71
296
  end
72
297
 
73
- def test_empty_cases
74
- assert_set(:empty)
298
+ def sequence
299
+ sentence.scan(/./).map { |c| Element.new(c, true) }
75
300
  end
76
301
 
77
- def test_edge_cases
78
- assert_set(:edge)
302
+ def insert(seq)
303
+ elem = Element.new(letter, true)
304
+ pos = rand(seq.length)
305
+ return [seq[0, pos] + [elem] + seq[pos .. -1], 1]
79
306
  end
80
307
 
81
- def test_utf8_cases
82
- with_encoding('U', 'UTF-8') do
83
- assert_set(:utf8)
308
+ # Delete an element, but only if we didn't add it - that would make the
309
+ # calculations complicated
310
+ def delete(seq)
311
+ pos = rand(seq.length)
312
+ if seq[pos].added
313
+ return [seq, 0]
314
+ else
315
+ return [seq[0, pos] + seq[(pos + 1) .. -1], 1]
84
316
  end
85
317
  end
86
318
 
87
- def test_iso_8859_1_cases
88
- with_encoding('NONE', 'ISO-8859-1') do
89
- assert_set(:iso_8859_1)
319
+ def substitute(seq)
320
+ pos = rand(seq.length)
321
+ if seq[pos].added
322
+ return [seq, 0]
323
+ else
324
+ elem = Element.new(letter, false)
325
+ return [seq[0, pos] + [elem] + se[(pos + 1) .. -1], 1]
90
326
  end
91
327
  end
92
328
 
329
+ def mutate(seq)
330
+ distance = 0
331
+ rand(seq.length).times do
332
+ method = [:insert, :delete, :substitute][rand(2)]
333
+ seq, d = send(method, seq)
334
+ distance += d
335
+ end
336
+ return [seq, distance]
337
+ end
338
+
339
+ def test_generated_samples
340
+ 100.times do
341
+ input = sequence
342
+ output, distance = mutate(input)
343
+ a = input.map(&:to_s).join("")
344
+ b = output.map(&:to_s).join("")
345
+ assert_equal distance, Text::Levenshtein.distance(a, b)
346
+ end
347
+ end
348
+
349
+ def test_generated_samples_with_maximum_distance
350
+ 100.times do
351
+ input = sequence
352
+ output, distance = mutate(input)
353
+ a = input.map(&:to_s).join("")
354
+ b = output.map(&:to_s).join("")
355
+ (0 .. distance).each do |d|
356
+ assert_equal d, Text::Levenshtein.distance(a, b, d)
357
+ end
358
+ (distance .. sequence.length).each do |d|
359
+ assert_equal distance, Text::Levenshtein.distance(a, b, d)
360
+ end
361
+ end
362
+ end
93
363
  end